scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value hellaswag,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.9999999999999999,0.016666666666666666 humaneval,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.31622776601683794,0.44848886103153174 mbpp,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.31622776601683794,0.44848886103153174 winogrande,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667 grounding,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.7378647873726218,0.07697417298126676 instruction_following,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.7378647873726218,0.07697417298126676 planning,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334 reasoning,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.9999999999999999,0.016666666666666666 refinement,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667 safety,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667 theory_of_mind,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333 tool_usage,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334 livebench_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667 reasoning_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334 coding_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667 mathematics_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667 data_analysis_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334 language_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334 if_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334 arena_hard,arena_hard_2404,aggregate,holistic,kendall,top_aggregate,5,0,0.9999999999999999,0.016666666666666666 mixeval,mixeval_240601,aggregate,holistic,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333 agieval,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.5270462766947298,0.206507295485425 arc_c,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334 alpacav1,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.5270462766947298,0.206507295485425 alpacav2,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334 alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.31622776601683794,0.44848886103153174 arena_elo,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.9999999999999999,0.016666666666666666 bbh,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.5270462766947298,0.206507295485425 eq_benchv2,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.9999999999999999,0.016666666666666666 gpt4all,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.0,1.0 hugging_6,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334 llmonitor,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334 magi,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.31622776601683794,0.44848886103153174 mmlu,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667 mt_bench,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334 biggen_mwr,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334 aggregate,holistic,hellaswag,BLZ_240312,kendall,top_aggregate,5,0,0.9999999999999999,0.016666666666666666 aggregate,holistic,humaneval,BLZ_240312,kendall,top_aggregate,5,0,0.31622776601683794,0.44848886103153174 aggregate,holistic,mbpp,BLZ_240312,kendall,top_aggregate,5,0,0.31622776601683794,0.44848886103153174 aggregate,holistic,winogrande,BLZ_240312,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667 aggregate,holistic,grounding,biggen_240612,kendall,top_aggregate,5,0,0.7378647873726218,0.07697417298126676 aggregate,holistic,instruction_following,biggen_240612,kendall,top_aggregate,5,0,0.7378647873726218,0.07697417298126676 aggregate,holistic,planning,biggen_240612,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334 aggregate,holistic,reasoning,biggen_240612,kendall,top_aggregate,5,0,0.9999999999999999,0.016666666666666666 aggregate,holistic,refinement,biggen_240612,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667 aggregate,holistic,safety,biggen_240612,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667 aggregate,holistic,theory_of_mind,biggen_240612,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333 aggregate,holistic,tool_usage,biggen_240612,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334 aggregate,holistic,livebench_average,livebench_240701,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667 aggregate,holistic,reasoning_average,livebench_240701,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334 aggregate,holistic,coding_average,livebench_240701,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667 aggregate,holistic,mathematics_average,livebench_240701,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667 aggregate,holistic,data_analysis_average,livebench_240701,kendall,top_aggregate,5,0,0.6,0.23333333333333334 aggregate,holistic,language_average,livebench_240701,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334 aggregate,holistic,if_average,livebench_240701,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334 aggregate,holistic,arena_hard,arena_hard_2404,kendall,top_aggregate,5,0,0.9999999999999999,0.016666666666666666 aggregate,holistic,mixeval,mixeval_240601,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333 aggregate,holistic,agieval,BLZ_240312,kendall,top_aggregate,5,0,0.5270462766947298,0.206507295485425 aggregate,holistic,arc_c,BLZ_240312,kendall,top_aggregate,5,0,0.6,0.23333333333333334 aggregate,holistic,alpacav1,BLZ_240312,kendall,top_aggregate,5,0,0.5270462766947298,0.206507295485425 aggregate,holistic,alpacav2,BLZ_240312,kendall,top_aggregate,5,0,0.6,0.23333333333333334 aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,top_aggregate,5,0,0.31622776601683794,0.44848886103153174 aggregate,holistic,arena_elo,BLZ_240312,kendall,top_aggregate,5,0,0.9999999999999999,0.016666666666666666 aggregate,holistic,bbh,BLZ_240312,kendall,top_aggregate,5,0,0.5270462766947298,0.206507295485425 aggregate,holistic,eq_benchv2,BLZ_240312,kendall,top_aggregate,5,0,0.9999999999999999,0.016666666666666666 aggregate,holistic,gpt4all,BLZ_240312,kendall,top_aggregate,5,0,0.0,1.0 aggregate,holistic,hugging_6,BLZ_240312,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334 aggregate,holistic,llmonitor,BLZ_240312,kendall,top_aggregate,5,0,0.6,0.23333333333333334 aggregate,holistic,magi,BLZ_240312,kendall,top_aggregate,5,0,0.31622776601683794,0.44848886103153174 aggregate,holistic,mmlu,BLZ_240312,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667 aggregate,holistic,mt_bench,BLZ_240312,kendall,top_aggregate,5,0,0.6,0.23333333333333334 aggregate,holistic,biggen_mwr,biggen_240612,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334