diff --git "a/static/eval_results/Default/all_model_keywords_stats.json" "b/static/eval_results/Default/all_model_keywords_stats.json" new file mode 100644--- /dev/null +++ "b/static/eval_results/Default/all_model_keywords_stats.json" @@ -0,0 +1,5150 @@ +{ + "GPT_4o": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.5630758211022604 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.6216411634729735 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.616018277142757 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.5823101249498799 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.44177544539510955 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.6345458069232931 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.6795263157894738 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.5514924675940659 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.39435038953269674 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.22934807257231926 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.608083455060831 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.491325251564869 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.4999089647103332 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.5315979872161023 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.5641404607063637 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.5613545677222056 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.47760591698367955 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.5388690453811203 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.48037685656449847 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.5994159671881645 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.44606605087301393 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.6274371950293718 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.5448877153826162 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.4751133786848073 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.5343350103400748 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.5672657028463585 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.5315979872161023 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.4500928191484624 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.4908653289106883 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.7056027785545881 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.33202130899313653 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.5032849161169843 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.5510350848991218 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.6095778863474799 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.5283797185155754 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.6135723164021851 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.44047720383044436 + } + } + }, + "Gemini_1.5_pro_002": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.5201947642961418 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.49947304390648534 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.5512750115216515 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.5467324805307577 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.425969084163906 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.5750369536204262 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.6982330827067671 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.513647745999633 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.3845337030093212 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.23899503258223884 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.4592162957187749 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.4292353723689881 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.4869625906903554 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.5028718355967439 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.5584779204331461 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.5500305447809621 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.4292127751495457 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.44896309957892694 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.44137714463131966 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.5146447350354234 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.4688623462674191 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.5580414823700747 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.5538255562099124 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.39066515495086923 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.5295721925617263 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.5034399620483027 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.5028718355967439 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.4885398161821004 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.4553778359922855 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.5378983862471568 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.3335324339429373 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.43465181771633377 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.5250631828331306 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.5821004797173627 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.5124355410095621 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.5721991184410764 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.41210885517904977 + } + } + }, + "Gemini_1.5_flash_002": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.46250942866818673 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.4317914359988347 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.49775198805427967 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.5098686082319499 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.34393279682972117 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.5594391803821158 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.6380250626566416 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.44816564352475535 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.34510790215980036 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.18973764406890803 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.3836737169374586 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.3598139859097534 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.4013870708864889 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.4903530871753026 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.5051202896842343 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.5166044655846657 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.3849084036535956 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.3869438864407766 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.3962715194192418 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.44793686445264996 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.3704146726364947 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.5448638967636353 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.47829883834573317 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.33669690098261523 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.4300676062024303 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.4427944359714585 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.4903530871753026 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.42346517633403413 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.41994719346489817 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.4627701625196691 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.2517485212411566 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.40372378342017806 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.4799408254775632 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.6010361821632402 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.4569546533897065 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.511590428993871 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.33710867194177685 + } + } + }, + "Claude_3.5": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.5405089647404562 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.6046357055234819 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.5712627152062051 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.5450038475783499 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.4767692987630454 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.5756126284078804 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.6969774436090224 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.5278843049497918 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.4082144793870471 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.23803578664609892 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.5637906302497772 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.4795267886975966 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.525848282456283 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.508735695828719 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.5699094130430454 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.5096772701625744 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.4429640420975014 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.5066797418318023 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.4926030136534706 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.5278127103234661 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.4490020843308984 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.5838224169821388 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.5456152399978661 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.46300075585789874 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.5292494759360522 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.5373019912310933 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.508735695828719 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.4422556748863689 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.49311554035078103 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.6593763006847053 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.3382015835012861 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.5194010220575684 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.532329797132399 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.5808831682303479 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.513474611293123 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.5507075880782885 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.47461998432626556 + } + } + }, + "Claude_3.5_new": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.5690045172520449 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.6220681231036606 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.6077980666415158 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.5511440615639541 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.4885536652013625 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.5908204006544897 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.6569473684210526 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.5486763511384175 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.4315385951907387 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.2909419331017877 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.6048192628845258 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.48924295292319175 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.556418710368288 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.4946691340754988 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.5558756390298104 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.5425198547046186 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.44210335381541843 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.5187252051932875 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.5071121107460066 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.5387340524651681 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.4824302644151348 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.6242798397166945 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.5782691045270721 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.4630277507828528 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.5914338446093256 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.5636254729390459 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.4946691340754988 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.4828123870640382 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.48756636014597515 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.6590137441693218 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.39901670035164916 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.5166853031535193 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.5561634744977417 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.6123769274172342 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.5512015158810595 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.565796566886933 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.4763267502912362 + } + } + }, + "GPT_4o_mini": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.44928744961868194 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.48842488118273475 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.5152626716886682 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.4672966076116977 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.3406008235342885 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.5572281917334303 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.6902380952380953 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.4189154010048976 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.2943206715105082 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.19422793560945503 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.4700389569079038 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.3624496929166193 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.38946844562183286 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.45508480503584553 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.47569921440672464 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.46468618797917643 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.29410984789062117 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.41174000979649644 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.38893151244736324 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.44244772638735347 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.3629944944697668 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.5713834131825314 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.39874839531459466 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.3359977324263039 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.4260710116168476 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.46322170353087255 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.45508480503584553 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.24651576711552803 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.3697506340557095 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.5640948591986592 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.2420320329702607 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.3458483931206892 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.43544861040322835 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.5176671720617656 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.3554299482098288 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.5398829253460956 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.32918280841495845 + } + } + }, + "Qwen2_VL_72B": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.49774395003470484 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.538829507114716 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.534480883952292 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.5092565754998357 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.3776739609562984 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.5676174603436022 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.60496992481203 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.4633019068994453 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.35105970797600183 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.2201150812944581 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.5356361790015363 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.4289777675393297 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.42094543671351287 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.49943888306036405 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.507967430369507 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.495761900914191 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.36212605501536715 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.4444770652190341 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.44584364394901616 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.5098505660529429 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.4027115384266939 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.5157810622684265 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.5199940976484408 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.3100812547241119 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.5364299983756791 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.4908605783408196 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.49943888306036405 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.36691704884033916 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.45169664275718613 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.5748195752273694 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.31245958897213383 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.4372517645050852 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.5343715685033166 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.4968249101570037 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.4488852456563113 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.5162919233645259 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.31157492395100744 + } + } + }, + "Qwen2_VL_7B": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.370836862933556 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.39973692484032347 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2511, + "tasks": [], + "average_score": 0.4012977216731433 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2469, + "tasks": [], + "average_score": 0.410990923097227 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.2818925976996871 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.493608784197707 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.5215889724310777 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.33309401517140946 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2439, + "tasks": [], + "average_score": 0.27564756843599875 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.1473690605854188 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.3814353882556586 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.2896392967775049 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.3223325179806271 + }, + "Videos": { + "count": 43, + "num_samples": 700, + "tasks": [], + "average_score": 0.4111189310485516 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.34825121621909577 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.4047366473438155 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.262166593895899 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.3403519326516044 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.3420538306638288 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.35162604166912687 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.32665673520415817 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1456, + "tasks": [], + "average_score": 0.3909745200389741 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.39898011714302023 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.19415154950869234 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.37301502633138073 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.3761693199448087 + }, + "video": { + "count": 43, + "num_samples": 700, + "tasks": [], + "average_score": 0.4111189310485516 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.26429868057315387 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.33008667137716374 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.42660307298355216 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.2003871750665659 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.3270187644950453 + }, + "Perception": { + "count": 145, + "num_samples": 2315, + "tasks": [], + "average_score": 0.39864841947520724 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.4245693009859056 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.29880557491654197 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.42766370932167636 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.25562039051316643 + } + } + }, + "llava_onevision_72B": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.3615741356043519 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.282401662313336 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.36653344218973427 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.42146038539739283 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.2951434804409883 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.478119286755779 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.6005438596491229 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.31663222188988865 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.29633645022129285 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.13872280436872364 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.23294708136735856 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.2126914943750874 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.34566020099204997 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.4446001874842145 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.4401364830377099 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.42429297143518147 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.23897262553543516 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.28614732096244 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.25872873777911126 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.370724080249463 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.3065719940769206 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.4293132525502993 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.3986052416087927 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.20730347694633405 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.27911174307216713 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.3481968601113118 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.4446001874842145 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.25013213032747944 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.34156793747875674 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.30653989171354723 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.18168666652660437 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.23240790940031927 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.38316803441883945 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.4807891958712894 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.31702495228966576 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.4358874880224115 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.31588468105075895 + } + } + }, + "llava_onevision_7B": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.2524786809911341 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.1902376706945491 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.255069390206439 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.29981286990552625 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.18973491465938852 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.36842322314565323 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.44998746867167916 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.2445135206648208 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.21802943568344288 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.06658775725427067 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.1466163383815089 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.13297395577964055 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.24236719143449742 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.30985943541023103 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.3199731020402028 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.3258716730180874 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.13043163858789789 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.20209776978059824 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.18285692568564196 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.25384794412815426 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.2200472229099345 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.3127341248874411 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.2802999516721972 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.1476473922902494 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.13787962981142515 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.25459683619676365 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.30985943541023103 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.1778991941079372 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.2410111891690358 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.19274192395698486 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.09846926279075068 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.15189414475467605 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.2845922887108415 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.3600079950628582 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.23654776813656775 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.3271805711561501 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.22080546908673507 + } + } + }, + "InternVL2_76B": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.38191947207402666 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.4103649605406274 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.4341802504488193 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.42654142415639185 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.2975890791763991 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.5257357753421337 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.5779473684210527 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.33287081421166276 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.2949505390920417 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.17036496432397477 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.362195416198664 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.31396468806559114 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.3473756113126343 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.395893002855977 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.44982107744035305 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.42686510293379315 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.2868239162778749 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.3603288661353782 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.3465926907358438 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.3943337471922549 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.29244088978470345 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.45822072478616577 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.3879326330400817 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.20309901738473166 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.34490184941501867 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.41372274360003347 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.395893002855977 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.24403942809507134 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.3152784738582855 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.4290949563510903 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.2132321995754061 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.2953329718984368 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.4201902630957567 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.47409276729986083 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.30014798153766264 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.46253164682269177 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.2868813944130515 + } + } + }, + "InternVL2_8B": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.2817247716997634 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.2794121858805306 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2511, + "tasks": [], + "average_score": 0.31918687243853283 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2469, + "tasks": [], + "average_score": 0.325593535916075 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.24118253695139918 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.39684007367798446 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.4700852130325815 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.27052668526005397 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2439, + "tasks": [], + "average_score": 0.23189345356483618 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.08260405712900723 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.2277532691786533 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.2013779290163996 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.2804429603269583 + }, + "Videos": { + "count": 43, + "num_samples": 700, + "tasks": [], + "average_score": 0.34791358240562653 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.2942163420306113 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.33787327172644077 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.10933317885944857 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.24944408255581693 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.25203287826995174 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.27414636444623874 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.22381302045502052 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1456, + "tasks": [], + "average_score": 0.3537549824897016 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.30261189962428353 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.15434618291761149 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.19814032315010577 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.30046383040641306 + }, + "video": { + "count": 43, + "num_samples": 700, + "tasks": [], + "average_score": 0.34791358240562653 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.17725087609332119 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.2532272454839157 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.29096771640715396 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.12166926715781588 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.24700310231619527 + }, + "Perception": { + "count": 145, + "num_samples": 2315, + "tasks": [], + "average_score": 0.3205471121079154 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.3995660275981844 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.24614711281861912 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.3393895915929317 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.22078333222564453 + } + } + }, + "MiniCPM_v2.6": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.2604969133146555 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.24828453993935928 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.2987613496312298 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.31808788094038193 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.18281637763548025 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.4073231792632807 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.48798245614035085 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.23723675736151562 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.1968926733821904 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.08735883237069725 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.21153173491931837 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.18639148159043903 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.21578309681746147 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.3527537836840162 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.3096882575625531 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.31628986040092516 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.0755920550038197 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.23302306387939006 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.17775369699584467 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.2551275278138797 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.20833171754655547 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.36473950920880716 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.293386806641223 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.13955971277399848 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.23499726844115643 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.2625611181730622 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.3527537836840162 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.17888270664238365 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.22288678972853282 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.26614948589295767 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.11693267119342445 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.15342045420318667 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.2910511308735813 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.3777897246686755 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.25714862989687987 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.33187792895542906 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.16493399805627715 + } + } + }, + "Phi-3.5-vision": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.2551037902226636 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.24734930136620975 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.2864612416413776 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.3049602749093698 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.21653804346780042 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.36823084724842464 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.46663157894736845 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.24145330077248778 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.2154692063816354 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.08944481289041872 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.18587661796707747 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.17497379027990792 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.26053460127801603 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.24669318645450836 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.2786226802221388 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.34091066308972107 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.15444746077692828 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.21711219915973207 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.2138304528863496 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.2572371188897671 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.21409351002477045 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.365192668303297 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.25960269434727634 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.12546296296296297 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.14174374624685185 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.2776898347355035 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.24669318645450836 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.20168001345379397 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.2850550871176333 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.22277777000798116 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.08928724806836039 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.219367263034246 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.31585879714366544 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.3945898792928062 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.21925278489551242 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.33264696401038385 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.17575913004138646 + } + } + }, + "Pixtral_12B": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.34602671066871027 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.3764652079852679 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.38183869685317606 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.3776679463596073 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.2828575553466608 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.4190587833823822 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.5687919799498747 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.32813540763467464 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.2677293131171651 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.10591240329992047 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.30581019415764066 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.28832738144368647 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.3223299098375932 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.409643099998057 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.37450808136321684 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.37068890840142343 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.24009431093278263 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.3071379066920702 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.31782992537086313 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.3639544140938305 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.32073418701669026 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.4166613092238043 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.3008126415966517 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.19743008314436883 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.16370884074367903 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.37086966536142313 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.409643099998057 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.2575699315401612 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.310449170121381 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.4285286292013588 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.13622980866275425 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.2572414987500377 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.388749951743596 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.5020540387409291 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.31301986568151985 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.38094471423409354 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.24222628640267738 + } + } + }, + "Llama_3_2_11B": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.1907604552173455 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.14280015951776653 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.1960311445935766 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.22399113135844315 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.13303760019716085 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.323153603297999 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.4260501253132832 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.1770852858056774 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.15366454315378308 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.06563884729522687 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.11886347847341794 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.11489351406848371 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.1693681214060816 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.2123769209846321 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.2520175802062012 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.24806929522702081 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.06418655520777307 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.12349256529641485 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.16374180545556977 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.1576236804437753 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.15014439824913947 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.3003142292328822 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.19270157739425633 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.1463246409674981 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.0732004839476103 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.19579907898674231 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.2123769209846321 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.1351857051327849 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.18586695387250338 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.17288724679416761 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.08100042975820579 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.0575426944971537 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.19853488174071646 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.254316961351997 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.162801811963855 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.28055776664538923 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.13937853323074623 + } + } + }, + "Idefics3": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.14507788965553362 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.11641535161320743 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.17255583910766542 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.14745217246476708 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.1331851390883708 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.19221534222332276 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.28640852130325817 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.17906399043310475 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.10192930055370109 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.04211916597550756 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.10126271262360581 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.11407926733108291 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.16225217317782772 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.16181866973635636 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.1839408679813373 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.14933801491626408 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.0395540896656236 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.13979628998424784 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.1062779093260333 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.07053056796593082 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.09790172378722654 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.2987797010800956 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.11588163814170001 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.1008692365835223 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.09308121224497533 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.14757589734485796 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.16181866973635636 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.12217834249866026 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.12276246278377517 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.14743542163139847 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.05354869594691955 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.09065540194572455 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.1463280929280822 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.14564374862578883 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.22748773785486257 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.17647756032677067 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.13168972973651977 + } + } + }, + "Aria": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.3264829094772722 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.35712138797286674 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.4004806395853317 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.3783082688258977 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.27628131703993153 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.4942870225393938 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.5811228070175439 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.3279996334048362 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.2481896092177717 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.11945216302285933 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.2830308005758272 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.27833423130489043 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.32371820359400666 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.42875359425696014 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.3612041984219992 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.37290568595471846 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.19554976321164697 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.3092653492193887 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.3043751656077328 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.2930015244066511 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.3092167834876797 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.4523860109667709 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.3277812604542708 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.21139455782312927 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.2711617723374526 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.3576735443060994 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.42875359425696014 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.19839956701033565 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.27267126872569447 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.38321397541649777 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.14301905320436192 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.2849545194421855 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.3779947327886569 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.39678729061309725 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.29682445889316517 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.4096377585306089 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.26194160419181234 + } + } + }, + "NVLM": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.24033557047857043 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.32154059695494047 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.2937052996171993 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.22845955700594492 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.2639741933075709 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.40870864071047447 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.4555238095238095 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.25785191641267197 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.15679681195908274 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.0672259242345112 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.23922823287047076 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.21734036617042948 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.30313485498585124 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.0 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.34726189956094355 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.3264757655296162 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.056894830390305184 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.22868389095927066 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.2788963949121424 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.2787764976961992 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.23349712171444964 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.3215948035793096 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.18487055428231897 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.0 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.0 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.3680809151131777 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.0 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.03838410364145658 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.2325581694709435 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.22773778915303383 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.08048160660797504 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.2390024647851972 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.30211261814126533 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.18857142857142856 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.24908307640275493 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.3724877947012685 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.24529601154794037 + } + } + }, + "InternVL2_2B": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.14491178903291552 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.12126906675624163 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.16912754929321935 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.18542274192083463 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.13923308734553164 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.23992252224543772 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.3420927318295739 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.14807577209152425 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.13036555933925006 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.01727799227799228 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.057021136657850864 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.10504085961245285 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.1625198552182714 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.18999779001767986 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.1487677475708977 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.2011727338536935 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.11886936592818943 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.1131404778887607 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.05739750616837997 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.15465451663650032 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.16044698450090833 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.21429521387724249 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.2128614316540013 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.03658352229780801 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.05757839721254354 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.15225683687839608 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.18999779001767986 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.17677460549936644 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.158165588340436 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.08722661966805 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.04102853815875594 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.11264043251709285 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.17001758160301803 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.3332891958712894 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.1686125516807394 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.21169137106199268 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.10975764217070672 + } + } + }, + "Qwen2_VL_2B": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.22236161923122505 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.23701014663017753 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.25669221785292334 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.26526414975225454 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.17623548305581763 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.31250702198481506 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.4140676691729323 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.20802820480076603 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.17320633068307653 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.06209506566980099 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.190837839372028 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.16287824421269087 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.19640906475019812 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.2520741776922928 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.24883076673424442 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.2877316297453947 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.13398525561847363 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.1624451002757208 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.20960092816529263 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.19986806708136184 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.2201024015934558 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.30248748033122763 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.256631742010999 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.07681405895691609 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.10526691703628158 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.25018977062352593 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.2520741776922928 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.17435940889565366 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.21286783416184518 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.2521972668785968 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.06967138760493456 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.16996250112948405 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.27603334911345223 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.31002436092347696 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.21061929716065056 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.2656728023444808 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.16356158787929762 + } + } + }, + "Aquila_VL_2B": { + "skills": { + "Object Recognition and Classification": { + "count": 303, + "num_samples": 4755, + "tasks": [], + "average_score": 0.18420666660337692 + }, + "Text Recognition (OCR)": { + "count": 137, + "num_samples": 2239, + "tasks": [], + "average_score": 0.12395530240359122 + }, + "Language Understanding and Generation": { + "count": 154, + "num_samples": 2509, + "tasks": [], + "average_score": 0.17924536722051596 + }, + "Scene and Event Understanding": { + "count": 154, + "num_samples": 2467, + "tasks": [], + "average_score": 0.220108610660707 + }, + "Mathematical and Logical Reasoning": { + "count": 109, + "num_samples": 1910, + "tasks": [], + "average_score": 0.1680749869910155 + }, + "Commonsense and Social Reasoning": { + "count": 51, + "num_samples": 855, + "tasks": [], + "average_score": 0.26630477322766793 + }, + "Ethical and Safety Reasoning": { + "count": 15, + "num_samples": 245, + "tasks": [], + "average_score": 0.35152130325814535 + }, + "Domain-Specific Knowledge and Skills": { + "count": 77, + "num_samples": 1386, + "tasks": [], + "average_score": 0.1857154485444521 + }, + "Spatial and Temporal Reasoning": { + "count": 152, + "num_samples": 2437, + "tasks": [], + "average_score": 0.1616397700608881 + }, + "Planning and Decision Making": { + "count": 37, + "num_samples": 577, + "tasks": [], + "average_score": 0.044513236949565 + } + }, + "input_format": { + "User Interface Screenshots": { + "count": 93, + "num_samples": 1517, + "tasks": [], + "average_score": 0.07480350331940272 + }, + "Text-Based Images and Documents": { + "count": 82, + "num_samples": 1294, + "tasks": [], + "average_score": 0.11444110320621242 + }, + "Diagrams and Data Visualizations": { + "count": 101, + "num_samples": 1718, + "tasks": [], + "average_score": 0.19412275574929044 + }, + "Videos": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.21367350061199514 + }, + "Artistic and Creative Content": { + "count": 32, + "num_samples": 541, + "tasks": [], + "average_score": 0.19717811128156643 + }, + "Photographs": { + "count": 143, + "num_samples": 2248, + "tasks": [], + "average_score": 0.24620947964695974 + }, + "3D Models and Aerial Imagery": { + "count": 11, + "num_samples": 169, + "tasks": [], + "average_score": 0.10131259529340846 + } + }, + "output_format": { + "contextual_formatted_text": { + "count": 98, + "num_samples": 1514, + "tasks": [], + "average_score": 0.11925340914357861 + }, + "structured_output": { + "count": 110, + "num_samples": 1714, + "tasks": [], + "average_score": 0.123417109500157 + }, + "exact_text": { + "count": 83, + "num_samples": 1278, + "tasks": [], + "average_score": 0.18474924824567768 + }, + "numerical_data": { + "count": 49, + "num_samples": 862, + "tasks": [], + "average_score": 0.19908864029107046 + }, + "open_ended_output": { + "count": 80, + "num_samples": 1454, + "tasks": [], + "average_score": 0.23278612647548963 + }, + "multiple_choice": { + "count": 85, + "num_samples": 1363, + "tasks": [], + "average_score": 0.22108484223035305 + } + }, + "input_num": { + "6-8 images": { + "count": 21, + "num_samples": 314, + "tasks": [], + "average_score": 0.11057256235827662 + }, + "9-image or more": { + "count": 41, + "num_samples": 623, + "tasks": [], + "average_score": 0.011631871744697361 + }, + "1-image": { + "count": 315, + "num_samples": 5228, + "tasks": [], + "average_score": 0.18240049845355885 + }, + "video": { + "count": 43, + "num_samples": 698, + "tasks": [], + "average_score": 0.21367350061199514 + }, + "4-5 images": { + "count": 34, + "num_samples": 520, + "tasks": [], + "average_score": 0.1898373110613516 + }, + "2-3 images": { + "count": 51, + "num_samples": 802, + "tasks": [], + "average_score": 0.23274180707905315 + } + }, + "app": { + "Information_Extraction": { + "count": 72, + "num_samples": 1124, + "tasks": [], + "average_score": 0.09484068019620011 + }, + "Planning": { + "count": 78, + "num_samples": 1239, + "tasks": [], + "average_score": 0.05864269260897992 + }, + "Coding": { + "count": 31, + "num_samples": 474, + "tasks": [], + "average_score": 0.13323092677931386 + }, + "Perception": { + "count": 145, + "num_samples": 2313, + "tasks": [], + "average_score": 0.20714098741611 + }, + "Metrics": { + "count": 20, + "num_samples": 309, + "tasks": [], + "average_score": 0.2932627505936196 + }, + "Science": { + "count": 29, + "num_samples": 574, + "tasks": [], + "average_score": 0.21075421274487907 + }, + "Knowledge": { + "count": 97, + "num_samples": 1605, + "tasks": [], + "average_score": 0.24110595572817994 + }, + "Mathematics": { + "count": 33, + "num_samples": 547, + "tasks": [], + "average_score": 0.20711160718581811 + } + } + } +} \ No newline at end of file