scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value Holmes,holmes_240829.csv,aggregate,aggregate,pearson,random,8,0,-0.017485869096098686,0.9672206778351959 Holmes,holmes_240829.csv,aggregate,aggregate,pearson,random,8,1,-0.06826285140114943,0.8724042132624071 Holmes,holmes_240829.csv,aggregate,aggregate,pearson,random,8,2,-0.27291992568490936,0.5131179718629255 Holmes,holmes_240829.csv,aggregate,aggregate,pearson,random,8,3,-0.0623085741331382,0.8834734515868299 Holmes,holmes_240829.csv,aggregate,aggregate,pearson,random,8,4,0.11553071904436202,0.7852997192967395 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8743737489954189,0.004501296794893102 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8019858294586086,0.01664169341252048 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.865218326418788,0.005519059390504801 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9324959770534272,0.0007305971150650418 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9578331579912773,0.00018155839890573593 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,-0.30992157835736617,0.4550353006304514 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,-0.48460771469003827,0.2235972811859595 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,-0.1162588388208577,0.78397092283469 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,-0.03180360013624742,0.9404084479868535 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.6310234888301745,0.09339585968843296 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.5719061307929368,0.1385541569597628 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,-0.2953447949582872,0.47758892197811004 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,-0.08547114468780825,0.8405203853999355 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,-0.02680948636066538,0.9497562944796989 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.4016145018471783,0.32402730112296474 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7247956777996108,0.04194484960329344 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.2767660595168839,0.5069548295866992 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.3337223270100439,0.4191769676693079 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.6126891094585267,0.10632638977302632 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8079257463851817,0.015261307993340337 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6647150497002838,0.07212235537894374 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9659235574949907,9.641323857066814e-05 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8593434484023453,0.0062437049978399314 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7704800482268904,0.025262942539415363 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9028773381740962,0.002126756432137772 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.748982925973149,0.032470780295939985 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8599957450436625,0.006160409391629476 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8718735582848011,0.004766072993988772 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9069576656171551,0.001875739334441522 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9502933219669614,0.00029570003340264575 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8905328662549648,0.003016032865892646 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.5058552901713423,0.20090402274559316 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6767432630833718,0.0652968761285632 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7135518769682414,0.04685902831102101 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.873661116609048,0.004575776138454243 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8775217778627072,0.004181622363896538 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7683490298001087,0.025928082489068475 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.741463148953373,0.035258455741147623 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7891209052525207,0.019892902878583873 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8157900850650412,0.013547661219765379 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8625206786227912,0.005844699973375535 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.49625129009057833,0.211004712621783 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7482300147416783,0.0327435760119495 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9237060456412569,0.0010476652712265917 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8540419074377281,0.00694751386877189 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7827735900001105,0.021632253958226707 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7416615606437577,0.03518309274676423 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8208959354305796,0.01250307893717913 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9182336628416601,0.0012842298120423852 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9199026021249039,0.0012087423991030853 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7807842071724994,0.022196180227557687 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6016089012086534,0.11460809097860054 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.85978308688271,0.006187486327563118 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9471155608874564,0.00035525230596496123 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9238574615349179,0.0010415614421426264 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.780599537830846,0.022248986205867058 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.753379355065838,0.030905705190702806 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8379676352721162,0.009384640911630616 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8462209992405952,0.008075105621350536 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9020771423654268,0.0021784040615750178 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9392379026634557,0.000535591367028614 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7412355057774336,0.035345043191044964 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8195179387247324,0.01277979740900836 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9675915145186947,8.304238414993675e-05 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9356246311290696,0.0006351718939850358 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7466011946729814,0.03333852605723143 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9551682330569339,0.00021776057653192886 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.46353588273705637,0.24734250900688215 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8866352243352398,0.003339629955133934 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.937902652612242,0.0005710971446370687 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.2831911510498836,0.4967225093410736 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.2031844122583542,0.6293846722461313 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8894964926830444,0.0031000020401251533 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.549284007260608,0.15849945140105312 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7889373199563972,0.01994193933246426 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9415411104598773,0.00047780769988844555 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8981158348442198,0.0024460728519243077 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7526431927239958,0.0311644661156264 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8962925022649735,0.0025761063553240114 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.937590300147702,0.0005796196796032962 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.5831241321997315,0.12921116102954364 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.5561145441014004,0.1523217142123119 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.5664450708720614,0.14323389729888122 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.47517181530974595,0.23407895750101468 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.718855715365913,0.04449992445427745 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7168604276016974,0.04537877960385103 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.18264726732113173,0.6650765454064547 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.04614314940391431,0.9136043258512831 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.6369093478690498,0.08944819108801377 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8650362997962656,0.005540656777637369 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9481614738377944,0.00033485605767966255 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8579024362848122,0.006430262194723998 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9674751054383679,8.39330376548511e-05 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9711920829273566,5.848502027941985e-05 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9121630061872308,0.0015845787994022296 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7303458809128464,0.03963972108447683 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7466964409211542,0.03330355520543848 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8886798251454765,0.0031672235640011434 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9036719475219376,0.002076262347775526 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7623592248502944,0.02785522986224059 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8902509919824877,0.0030387234498153886 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8349964637145074,0.009887030967730168 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9513669166922365,0.00027717775621958416 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.979588964641596,2.0934517813580252e-05 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8898917220751776,0.0030678038612609354 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8000397965603336,0.01711033114623395 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7666453684194998,0.026467542617941944 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8751438663188438,0.004421691058140597 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8954496186826447,0.0026376993343606783 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8195357136433342,0.012776203631959988 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8973997559676354,0.0024966210305528294 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9238541898435834,0.0010416930833947954 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9232578806881373,0.0010658683179569461 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9466806411756816,0.00036396834317210526 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9402048459613361,0.0005108048313780666 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7428545649568395,0.03473202812850355 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8962239297969814,0.0025810820467571426 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9528032040825007,0.0002536158007562822 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8136140570811612,0.01400900062666989 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.5749045753814719,0.13602130778385005 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.780595487125304,0.022250145374352125 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8389921086523722,0.009215256295109017 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8803463320171083,0.003907570379771439 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7142670311425445,0.04653663665491792 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7977979460712193,0.017660348313797546 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7240026280446691,0.04228069432019545 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8051290094703403,0.01590190576987268 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9226246952938778,0.0010919364406592675 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.770582228125362,0.025231318204288148 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.5188109005585113,0.18769119165787862 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9642212364414142,0.00011145218096014672 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7836454491081474,0.021387948565361206 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,pearson,random,8,2,0.865235745718993,0.005516995432107779 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,pearson,random,8,3,0.819500116935474,0.012783401302719894 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7432637726714306,0.034578129186903464 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9522400671025366,0.0002626898916961467 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9732730607216835,4.677795327851405e-05 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9110081304703664,0.001646433879397326 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9433518650586681,0.0004353717167521428 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,pearson,random,8,4,0.880586328075459,0.003884834219553849 Helm Classic,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9603201312455674,0.00015157780411521223 Helm Classic,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9604114108423772,0.00015054459028416203 Helm Classic,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9519258192529104,0.00026784516618954716 Helm Classic,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9352773832366816,0.0006453340323628832 Helm Classic,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9756845534259928,3.5288470321501036e-05 Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6730282904268812,0.06736225845470355 Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9361725603565639,0.0006193510978979659 Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8618105831276622,0.005932414266978994 Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9371490197710903,0.0005918014940797798 Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8046621876144952,0.01601044603512172 Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.41770329390345684,0.30313696659492734 Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6529975286213465,0.07915856325659755 Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6950517775314824,0.05566978580633573 Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.5130382972054114,0.19351964488420637 Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.6825577913683614,0.062140382561143265 Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9059635004669196,0.0019350193188838174 Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8702987510549938,0.00493787146977232 Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8349295032906534,0.009898545248446817 Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8245663895988613,0.011784555837564846 Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9186996315597573,0.0012628532368153516 Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.899783088468177,0.002330962388754791 Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8724919719311256,0.004699674798249593 Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9486250828884353,0.00032606741963897914 Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9264530754805538,0.0009405124032405977 Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.926933634016331,0.000922537739358256 Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6984411569502376,0.05398723363884652 Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.754828418128203,0.03040022622820331 Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.5655988276473191,0.14396676855997925 Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9407474980820671,0.000497230334167822 Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.770589245932409,0.025229147116181697 Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7775815292717585,0.023123063813025962 Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.5611200837416681,0.14787988852194642 Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.34646366697352105,0.40049416986179387 Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7868643731535557,0.020500867535993103 Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8114670933196435,0.014473750045325934 Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.4013581254554363,0.32436552572418753 Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.28341806840646894,0.4963625961904983 Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.3139211847524032,0.44892434309679713 Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.2606167560977108,0.5330194398770082 Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.32260154615753545,0.43577896021471924 Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7827817854375669,0.021629949458519884 Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9421767369217469,0.0004626159242720608 Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.5386185630062554,0.16841388744478442 Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7045551126623175,0.05103000019308416 Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8414540075802577,0.00881618884168942 Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8748256107732684,0.0044544778532186755 Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8614522174161048,0.005976999431835443 Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7878166990611953,0.02024289628983945 Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8381151096374623,0.009360136935052572 Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.876154278920616,0.0043186280005204514 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9802952193136,1.884578972104051e-05 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9841937367574427,9.755845662836177e-06 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8661864185981796,0.005405102460401999 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8297856426405835,0.010808669505560614 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9329487606730291,0.000716243089312378 MixEval,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.822202489777381,0.01224422861798353 MixEval,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6968865871905413,0.05475511707469452 MixEval,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9318897100616549,0.0007501099193828288 MixEval,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7939152572032528,0.018638835543465734 MixEval,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7761614135775217,0.02354161442763604 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9079242687040253,0.0018192466167481706 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.5935991848770941,0.12081484777974201 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.96841302674998,7.693398893847449e-05 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9131963004520903,0.001530535130781307 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7594573765014532,0.02881968270449265 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6622792441367216,0.07355344210000651 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.5835165093102912,0.1288909419896904 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7271748558955601,0.04094703171178795 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7369082697183147,0.0370157216672518 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7219159720057066,0.04317213020613491 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8973595810319037,0.002499476856786579 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6540145328427245,0.07853263145320354 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9470816844896075,0.0003559262259996983 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.798793471524343,0.017414760604056785 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.766501585020503,0.026513385703318352 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6776894663079587,0.06477689572321889 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6576248245381009,0.07633405000799688 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.796342090311639,0.018023378799051942 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.689140856921657,0.058678219175095074 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.6705942614169457,0.06873614015066103 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6842754194067544,0.0612256583562849 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7338112096805872,0.03824046140795786 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8786344078919507,0.0040722405599500165 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8914863638509409,0.0029400900210167272 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8522000994286094,0.007203358614415384 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7479170810940026,0.03285737031031745 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.5899049701184135,0.1237398240474465 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.864013241961245,0.005663050469813282 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.726560560314063,0.04120326937800088 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7600546147835674,0.02861953111724766 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8675817638279608,0.00524352512595729 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.4358953069712842,0.280322780055143 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8724977849323057,0.004699053502733089 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.871502377377448,0.004806214049293794 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.636462032322589,0.08974474991245225 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7407371067623334,0.035535069908202585 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.13754152986907456,0.7453436298315592 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8584434869588686,0.006359804257501524 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9096718109287911,0.0017199423212977748 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.429513562091493,0.2882272134157949 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7258395762861067,0.04150524782255408 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.4140057077993773,0.3078793667149351 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8860840192325219,0.003387122941063616 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8531999374729967,0.007063738601380546 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.570698753672453,0.13958138247636556 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9462124246513754,0.00037350751375720304 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,pearson,random,8,1,0.820982530302196,0.012485817170678851 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9284819872198913,0.0008661544234609058 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9226572389021586,0.0010905865909148318 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8996834645928126,0.0023377397968761906 OpenCompass,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9806889787900566,1.77437080791335e-05 OpenCompass,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9467481050448351,0.00036260722071780783 OpenCompass,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9051882617143683,0.001982079878231783 OpenCompass,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8448816290057799,0.008279149903754354 OpenCompass,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9486969514405281,0.0003247187445212263 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7138885174194392,0.046707103452906885 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.40763933138747765,0.3161269846214854 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.5033557119680766,0.20350786972733814 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.4943676910774294,0.21301612937354739 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.3662549994154035,0.3722134961617391 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6943274080319848,0.05603338677616118 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.888202282224346,0.0032069637473251308 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.862959786938574,0.0057908774192851585 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.4422315456206938,0.2725814015162671 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9314197867245828,0.0007654668867563735 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8576726697477571,0.006460333718352682 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6477798867796105,0.08241558395766836 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7105249096891054,0.04823848031855015 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7433756448219943,0.034536127920169364 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.465629371128827,0.24492880327618063 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9815968610969954,1.5367458655827867e-05 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9215279351913577,0.0011380681078154023 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9028698976709195,0.0021272329705264844 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8115257987039834,0.014460915122317916 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8840656907304268,0.003564741739845647 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9288767434076772,0.0008521494712455959 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8762491857760322,0.004309027650395265 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.822174167720692,0.012249803466994006 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8388480886223416,0.009238949980481774 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9318866818637482,0.0007502082286076188 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6752208316271633,0.06613869004956173 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7677373687773497,0.026120973578910495 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7919204265038193,0.01915443839404165 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8238198607264919,0.01192852239680578 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8788769140000767,0.0040486473187813605 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.5937971020205063,0.1206592532108973 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6743688104667733,0.0666125934693148 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6092910701405022,0.10882867605607495 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.02436876480189197,0.954326651607438 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7114255278499215,0.04782552820112736 OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,pearson,random,8,0,0.5887872724291499,0.12463254240428198 OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,pearson,random,8,1,0.4029552549015283,0.32226121873409685 OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,pearson,random,8,2,0.19589220319331574,0.6419903458052949 OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,pearson,random,8,3,0.5147894627560958,0.1917415408232741 OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,pearson,random,8,4,0.43696792691727815,0.2790047957490856 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9683600812057522,7.731839943750683e-05 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9490060035318915,0.00031896092810029624 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9033732116949054,0.0020951534061901173 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9728319200142996,4.911626350007423e-05 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9174158952141087,0.0013223130420052574 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8698029729880158,0.00499276771087744 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9736499620869766,4.483954353741208e-05 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8938963574061565,0.002753683842916408 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9427230009399408,0.00044981624708065733 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9288091831587435,0.0008545357544848401 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9876650170257133,4.648675321533348e-06 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9200698352872445,0.0012013420941124318 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8197843971795349,0.012725991028944833 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9667731014329254,8.943826166773405e-05 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9135236868955329,0.0015136659995374103 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.983826044072315,1.0449743172360012e-05 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9482689395026054,0.000332805134027447 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9334433471484072,0.0007007762613840839 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8998371432675459,0.0023272903802322954 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9131450099069247,0.0015331889972515346 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9445409047411082,0.00040889964932544416 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8996453255999854,0.00234033776853281 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8662449830102448,0.005398257529969565 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9506955154682739,0.00028866872380162265 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9121357775980045,0.0015860194531010332 LiveBench Language,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9469225816315634,0.000359102582060145 LiveBench Language,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.790872393374341,0.019428850798750914 LiveBench Language,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7384692720332464,0.03640761031575469 LiveBench Language,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9396936265489109,0.0005238133760109684 LiveBench Language,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7853349194194776,0.020919442242219075 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8636070293544758,0.005712124057773506 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.837126038633602,0.009525258316342535 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7663953319208139,0.026547294337781743 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8834569465544357,0.00361946726545403 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8480938359553485,0.00779520658099071 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9882164477730901,4.05436289119973e-06 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9749878899040407,3.838912250625781e-05 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9189017807616305,0.0012536521795481071 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.976785228034165,3.073554131266073e-05 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9683736529744773,7.721974100004276e-05 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8793267175321069,0.004005119722136405 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8760721346635911,0.004326948446281908 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9315137258308156,0.0007623806815109492 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9671655908223616,8.633181797191984e-05 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7675767218262903,0.026171781192995118 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8483878251754778,0.007751839541749867 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9222607240796445,0.0011071076795417618 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9440994017259922,0.00041860181264251746 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9640433681068886,0.00011310737614553013 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.692434840005101,0.056990052908859494 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9942767822652612,4.6665376445687894e-07 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9682235346488557,7.831565067564543e-05 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9370054660599566,0.0005958002530390111 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.969420946106877,6.985512173523951e-05 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9834828472581691,1.1126279772397877e-05 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9928216304628095,9.197638948465057e-07 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9611899818187688,0.00014192004448559492 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9411758308443503,0.0004866843681750784 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9688368521395198,7.390226580769654e-05 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9723616916410369,5.16925798887181e-05 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9766036636486001,3.14580315476573e-05 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9597878054141521,0.00015769662952759886 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9404428288332258,0.0005048221249291256 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9831715348590928,1.176456701375346e-05 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.97187689823272,5.4440740892278444e-05 WildBench Score,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9852421877364517,7.946695487913594e-06 WildBench Score,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9692179758222269,7.124441373542135e-05 WildBench Score,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9412248237761267,0.000485487558057933 WildBench Score,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.98025276424875,1.8967257174977277e-05 WildBench Score,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9846373995357367,8.960181355366343e-06 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9522400671025366,0.0002626898916961467 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9732730607216835,4.677795327851405e-05 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9110081304703664,0.001646433879397326 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9433518650586681,0.0004353717167521428 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,pearson,random,8,4,0.880586328075459,0.003884834219553849 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.9355663499255871,0.0006368701046576545 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.9499604642147754,0.0003016036750416735 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.7164442699126142,0.04556339297891151 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.5643812833359342,0.14502482192576685 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.4448334653124403,0.269433453257965 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.9020957808919513,0.002177191904645508 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.9140262325400854,0.0014880077902407654 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.6613543728531551,0.07410115498793113 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.4797794956768499,0.2289297958345603 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.49503702005526434,0.21230024172428238 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.8658004484348707,0.005450353400185282 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.9239450258900821,0.0010380421984977164 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.6878185417270377,0.05936418242167244 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.6427492187377651,0.08562857067256696 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.19987101474191585,0.6351028985023905 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.7695981699173929,0.025536900476404875 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.851160886507116,0.00735033097799936 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.7535063061583401,0.030861215825263487 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.26946310602236634,0.5186811891252074 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.5071239778851739,0.19958915881626008 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.845558834843199,0.00817557674320208 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.8223598748455347,0.01221327849153134 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.7520379034546343,0.03137821860478068 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.5986152394502113,0.1169062576526029 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.766509325140422,0.026510916638992615 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.6388656044215879,0.08815791552969902 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.8220592376168137,0.012272442496278822 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.1610992186087647,0.7031245257171708 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.22938177579714764,0.584757473087143 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.16217150942988084,0.7012176634258844 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.8536693780854105,0.0069987855857581984 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.9079591032101378,0.0018172316533511903 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.7448797028215589,0.033974472983626124 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.436470242791583,0.2796159471960331 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.5113717481429286,0.195219904727713 BFCL,bfcl_240906.csv,aggregate,aggregate,pearson,random,8,0,0.8848684214582546,0.0034933971141531536 BFCL,bfcl_240906.csv,aggregate,aggregate,pearson,random,8,1,0.9247518427204778,0.0010059807632682822 BFCL,bfcl_240906.csv,aggregate,aggregate,pearson,random,8,2,0.7024798803756629,0.05202256738347333 BFCL,bfcl_240906.csv,aggregate,aggregate,pearson,random,8,3,0.6111548412929141,0.10745210550108082 BFCL,bfcl_240906.csv,aggregate,aggregate,pearson,random,8,4,0.8864983521119945,0.0033513827582610342 BIGGEN,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8443252756395498,0.008364861793357709 BIGGEN,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8199557285303699,0.012691469447090417 BIGGEN,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6898121736766818,0.05833178396126367 BIGGEN,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.1445400076243653,0.732738456710739 BIGGEN,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.13444519427677581,0.7509364951619687 BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9461712339012929,0.00037435448514068834 BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8543556725359636,0.006904516600543572 BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7671160990392422,0.026317800283773948 BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.4230508906614041,0.29634091151848907 BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,0.29492042180464345,0.478252042515081 BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8192056092552416,0.01284304904344425 BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8053230426409881,0.015856927546595193 BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6785867773117831,0.06428605698561919 BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.021028776761034942,0.960582665935811 BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.25337930013147175,0.5448562000018814 BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8101772449555595,0.014757563523095152 BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7844308170919763,0.021169355122089707 BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6407686957715764,0.08691312009391092 BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.042093006210129874,0.9211687904012325 BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.2813292229519864,0.4996795026573654 BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8350456630970934,0.00987857623206292 BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.879311548672376,0.004006582681021272 BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6951300585252861,0.0556305769370549 BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.30955291195703166,0.4556002793087552 BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,0.09897629382276267,0.8156278898050575 BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8313126956210078,0.010533178480029779 BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8169388413464165,0.01330802664448977 BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8065284450649773,0.015579295379409611 BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.23722382427262312,0.5716108619128892 BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,0.026088426326565897,0.9511063910298649 BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.5558829816104426,0.15252894598370506 BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6390946692796851,0.08800754271923365 BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.24121345447897227,0.5649619826999719 BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,-0.13262144042688304,0.7542351704927408 BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.46784288126219703,0.24238975539995447 BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7467577882406231,0.03328104267130768 BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7611545287510072,0.028253164658278467 BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6541774611460981,0.07843262445172178 BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.0830822493170678,0.8449361587214159 BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.1985934514676979,0.6373119372341151 BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9103256104990007,0.001683717098370581 BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8079204807250888,0.015262498588799642 BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7253154362419392,0.0417256201301186 BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.2776474358858506,0.5055464711128136 BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.04029159995291984,0.9245349726533298 BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.919432996814919,0.0012296819224052442 BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.87005129824662,0.004965222567299112 BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9073703100625691,0.001851485138509531 BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8673887162219034,0.005265692212272121 BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8916723527123611,0.0029254223429427636 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9804801069360884,1.832282630082123e-05 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.980051779203359,1.9549343460335766e-05 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.9720131442366731,5.3658869462094946e-05 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.9165887813382055,0.001361572704071016 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.9225103255266087,0.0010966889416837342 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9292369266176062,0.000839501038985727 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.9505492134066896,0.00029121355501060477 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.9415690777822339,0.00047713248045663163 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.9576750897378552,0.00018358576102437457 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.8850761460392197,0.0034750864462593195 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9798647388383346,2.0101576768271062e-05 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.9781250835045174,2.5741076148769547e-05 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.9598475365356987,0.00015700207944980397 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.9317002702003969,0.000756276259880365 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.8240635545541923,0.011881405061211926 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9849433970479835,8.437305784682183e-06 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.9899107226768695,2.548168158279175e-06 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.9645217100316719,0.00010869253777108847 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.9447465624679983,0.00040443116308794275 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.8760879368136391,0.0043253470355424355 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9469408250476264,0.0003587374254477132 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.9498225876442147,0.000304071618749767 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.9413785598975157,0.0004817446027243596 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.8197292667265523,0.012737111858293043 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.9057861973602506,0.0019457176947306907 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9413025091864188,0.000483593804288479 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.9083254977326705,0.001796125778484392 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.8626635526406192,0.005827152548807454 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.8043418970652331,0.016085184583393794 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.8946872852632068,0.0026942203148939193 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9025950086780581,0.002144887259438991 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.7564264003460613,0.02984872863501939 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.9033527343998258,0.002096452391428316 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.8494277893147777,0.0075996673267298715 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.8534145445088147,0.007033997470343221 aggregate,aggregate,Holmes,holmes_240829.csv,pearson,random,8,0,-0.017485869096098686,0.9672206778351959 aggregate,aggregate,Holmes,holmes_240829.csv,pearson,random,8,1,-0.06826285140114943,0.8724042132624071 aggregate,aggregate,Holmes,holmes_240829.csv,pearson,random,8,2,-0.27291992568490936,0.5131179718629255 aggregate,aggregate,Holmes,holmes_240829.csv,pearson,random,8,3,-0.0623085741331382,0.8834734515868299 aggregate,aggregate,Holmes,holmes_240829.csv,pearson,random,8,4,0.11553071904436202,0.7852997192967395 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,pearson,random,8,0,0.8743737489954189,0.004501296794893102 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,pearson,random,8,1,0.8019858294586086,0.01664169341252048 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,pearson,random,8,2,0.865218326418788,0.005519059390504801 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,pearson,random,8,3,0.9324959770534272,0.0007305971150650418 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,pearson,random,8,4,0.9578331579912773,0.00018155839890573593 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,pearson,random,8,0,-0.30992157835736617,0.4550353006304514 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,pearson,random,8,1,-0.48460771469003827,0.2235972811859595 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,pearson,random,8,2,-0.1162588388208577,0.78397092283469 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,pearson,random,8,3,-0.03180360013624742,0.9404084479868535 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,pearson,random,8,4,-0.6310234888301745,0.09339585968843296 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,pearson,random,8,0,0.5719061307929368,0.1385541569597628 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,pearson,random,8,1,-0.2953447949582872,0.47758892197811004 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,pearson,random,8,2,-0.08547114468780825,0.8405203853999355 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,pearson,random,8,3,-0.02680948636066538,0.9497562944796989 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,pearson,random,8,4,-0.4016145018471783,0.32402730112296474 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,pearson,random,8,0,0.7247956777996108,0.04194484960329344 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,pearson,random,8,1,0.2767660595168839,0.5069548295866992 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,pearson,random,8,2,0.3337223270100439,0.4191769676693079 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,pearson,random,8,3,0.6126891094585267,0.10632638977302632 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,pearson,random,8,4,0.8079257463851817,0.015261307993340337 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,pearson,random,8,0,0.6647150497002838,0.07212235537894374 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,pearson,random,8,1,0.9659235574949907,9.641323857066814e-05 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,pearson,random,8,2,0.8593434484023453,0.0062437049978399314 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,pearson,random,8,3,0.7704800482268904,0.025262942539415363 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,pearson,random,8,4,0.9028773381740962,0.002126756432137772 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,pearson,random,8,0,0.748982925973149,0.032470780295939985 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,pearson,random,8,1,0.8599957450436625,0.006160409391629476 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,pearson,random,8,2,0.8718735582848011,0.004766072993988772 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,pearson,random,8,3,0.9069576656171551,0.001875739334441522 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,pearson,random,8,4,0.9502933219669614,0.00029570003340264575 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,pearson,random,8,0,0.8905328662549648,0.003016032865892646 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,pearson,random,8,1,0.5058552901713423,0.20090402274559316 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,pearson,random,8,2,0.6767432630833718,0.0652968761285632 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,pearson,random,8,3,0.7135518769682414,0.04685902831102101 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,pearson,random,8,4,0.873661116609048,0.004575776138454243 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,pearson,random,8,0,0.8775217778627072,0.004181622363896538 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,pearson,random,8,1,0.7683490298001087,0.025928082489068475 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,pearson,random,8,2,0.741463148953373,0.035258455741147623 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,pearson,random,8,3,0.7891209052525207,0.019892902878583873 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,pearson,random,8,4,0.8157900850650412,0.013547661219765379 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,pearson,random,8,0,0.8625206786227912,0.005844699973375535 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,pearson,random,8,1,0.49625129009057833,0.211004712621783 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,pearson,random,8,2,0.7482300147416783,0.0327435760119495 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,pearson,random,8,3,0.9237060456412569,0.0010476652712265917 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,pearson,random,8,4,0.8540419074377281,0.00694751386877189 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,pearson,random,8,0,0.7827735900001105,0.021632253958226707 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,pearson,random,8,1,0.7416615606437577,0.03518309274676423 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,pearson,random,8,2,0.8208959354305796,0.01250307893717913 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,pearson,random,8,3,0.9182336628416601,0.0012842298120423852 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,pearson,random,8,4,0.9199026021249039,0.0012087423991030853 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,pearson,random,8,0,0.7807842071724994,0.022196180227557687 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,pearson,random,8,1,0.6016089012086534,0.11460809097860054 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,pearson,random,8,2,0.85978308688271,0.006187486327563118 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,pearson,random,8,3,0.9471155608874564,0.00035525230596496123 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,pearson,random,8,4,0.9238574615349179,0.0010415614421426264 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.780599537830846,0.022248986205867058 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.753379355065838,0.030905705190702806 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.8379676352721162,0.009384640911630616 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.8462209992405952,0.008075105621350536 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.9020771423654268,0.0021784040615750178 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.9392379026634557,0.000535591367028614 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.7412355057774336,0.035345043191044964 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.8195179387247324,0.01277979740900836 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.9675915145186947,8.304238414993675e-05 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.9356246311290696,0.0006351718939850358 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.7466011946729814,0.03333852605723143 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.9551682330569339,0.00021776057653192886 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.46353588273705637,0.24734250900688215 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.8866352243352398,0.003339629955133934 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.937902652612242,0.0005710971446370687 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.2831911510498836,0.4967225093410736 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.2031844122583542,0.6293846722461313 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.8894964926830444,0.0031000020401251533 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.549284007260608,0.15849945140105312 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.7889373199563972,0.01994193933246426 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.9415411104598773,0.00047780769988844555 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.8981158348442198,0.0024460728519243077 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.7526431927239958,0.0311644661156264 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.8962925022649735,0.0025761063553240114 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.937590300147702,0.0005796196796032962 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.5831241321997315,0.12921116102954364 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.5561145441014004,0.1523217142123119 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.5664450708720614,0.14323389729888122 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.47517181530974595,0.23407895750101468 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.718855715365913,0.04449992445427745 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.7168604276016974,0.04537877960385103 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.18264726732113173,0.6650765454064547 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.04614314940391431,0.9136043258512831 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.6369093478690498,0.08944819108801377 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.8650362997962656,0.005540656777637369 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,pearson,random,8,0,0.9481614738377944,0.00033485605767966255 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,pearson,random,8,1,0.8579024362848122,0.006430262194723998 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,pearson,random,8,2,0.9674751054383679,8.39330376548511e-05 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,pearson,random,8,3,0.9711920829273566,5.848502027941985e-05 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,pearson,random,8,4,0.9121630061872308,0.0015845787994022296 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,pearson,random,8,0,0.7303458809128464,0.03963972108447683 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,pearson,random,8,1,0.7466964409211542,0.03330355520543848 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,pearson,random,8,2,0.8886798251454765,0.0031672235640011434 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,pearson,random,8,3,0.9036719475219376,0.002076262347775526 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,pearson,random,8,4,0.7623592248502944,0.02785522986224059 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,pearson,random,8,0,0.8902509919824877,0.0030387234498153886 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,pearson,random,8,1,0.8349964637145074,0.009887030967730168 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,pearson,random,8,2,0.9513669166922365,0.00027717775621958416 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,pearson,random,8,3,0.979588964641596,2.0934517813580252e-05 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,pearson,random,8,4,0.8898917220751776,0.0030678038612609354 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,pearson,random,8,0,0.8000397965603336,0.01711033114623395 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,pearson,random,8,1,0.7666453684194998,0.026467542617941944 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,pearson,random,8,2,0.8751438663188438,0.004421691058140597 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,pearson,random,8,3,0.8954496186826447,0.0026376993343606783 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,pearson,random,8,4,0.8195357136433342,0.012776203631959988 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,pearson,random,8,0,0.8973997559676354,0.0024966210305528294 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,pearson,random,8,1,0.9238541898435834,0.0010416930833947954 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,pearson,random,8,2,0.9232578806881373,0.0010658683179569461 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,pearson,random,8,3,0.9466806411756816,0.00036396834317210526 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,pearson,random,8,4,0.9402048459613361,0.0005108048313780666 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,pearson,random,8,0,0.7428545649568395,0.03473202812850355 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,pearson,random,8,1,0.8962239297969814,0.0025810820467571426 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,pearson,random,8,2,0.9528032040825007,0.0002536158007562822 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,pearson,random,8,3,0.8136140570811612,0.01400900062666989 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,pearson,random,8,4,0.5749045753814719,0.13602130778385005 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,pearson,random,8,0,0.780595487125304,0.022250145374352125 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,pearson,random,8,1,0.8389921086523722,0.009215256295109017 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,pearson,random,8,2,0.8803463320171083,0.003907570379771439 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,pearson,random,8,3,0.7142670311425445,0.04653663665491792 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,pearson,random,8,4,0.7977979460712193,0.017660348313797546 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,pearson,random,8,0,0.7240026280446691,0.04228069432019545 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,pearson,random,8,1,0.8051290094703403,0.01590190576987268 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,pearson,random,8,2,0.9226246952938778,0.0010919364406592675 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,pearson,random,8,3,0.770582228125362,0.025231318204288148 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,pearson,random,8,4,0.5188109005585113,0.18769119165787862 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,pearson,random,8,0,0.9642212364414142,0.00011145218096014672 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,pearson,random,8,1,0.7836454491081474,0.021387948565361206 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,pearson,random,8,2,0.865235745718993,0.005516995432107779 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,pearson,random,8,3,0.819500116935474,0.012783401302719894 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,pearson,random,8,4,0.7432637726714306,0.034578129186903464 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,pearson,random,8,0,0.9522400671025366,0.0002626898916961467 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,pearson,random,8,1,0.9732730607216835,4.677795327851405e-05 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,pearson,random,8,2,0.9110081304703664,0.001646433879397326 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,pearson,random,8,3,0.9433518650586681,0.0004353717167521428 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,pearson,random,8,4,0.880586328075459,0.003884834219553849 aggregate,aggregate,Helm Classic,helm_classic_240829.csv,pearson,random,8,0,0.9603201312455674,0.00015157780411521223 aggregate,aggregate,Helm Classic,helm_classic_240829.csv,pearson,random,8,1,0.9604114108423772,0.00015054459028416203 aggregate,aggregate,Helm Classic,helm_classic_240829.csv,pearson,random,8,2,0.9519258192529104,0.00026784516618954716 aggregate,aggregate,Helm Classic,helm_classic_240829.csv,pearson,random,8,3,0.9352773832366816,0.0006453340323628832 aggregate,aggregate,Helm Classic,helm_classic_240829.csv,pearson,random,8,4,0.9756845534259928,3.5288470321501036e-05 aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,pearson,random,8,0,0.6730282904268812,0.06736225845470355 aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,pearson,random,8,1,0.9361725603565639,0.0006193510978979659 aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,pearson,random,8,2,0.8618105831276622,0.005932414266978994 aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,pearson,random,8,3,0.9371490197710903,0.0005918014940797798 aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,pearson,random,8,4,0.8046621876144952,0.01601044603512172 aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,pearson,random,8,0,0.41770329390345684,0.30313696659492734 aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,pearson,random,8,1,0.6529975286213465,0.07915856325659755 aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,pearson,random,8,2,0.6950517775314824,0.05566978580633573 aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,pearson,random,8,3,0.5130382972054114,0.19351964488420637 aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,pearson,random,8,4,0.6825577913683614,0.062140382561143265 aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,pearson,random,8,0,0.9059635004669196,0.0019350193188838174 aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,pearson,random,8,1,0.8702987510549938,0.00493787146977232 aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,pearson,random,8,2,0.8349295032906534,0.009898545248446817 aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,pearson,random,8,3,0.8245663895988613,0.011784555837564846 aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,pearson,random,8,4,0.9186996315597573,0.0012628532368153516 aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,pearson,random,8,0,0.899783088468177,0.002330962388754791 aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,pearson,random,8,1,0.8724919719311256,0.004699674798249593 aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,pearson,random,8,2,0.9486250828884353,0.00032606741963897914 aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,pearson,random,8,3,0.9264530754805538,0.0009405124032405977 aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,pearson,random,8,4,0.926933634016331,0.000922537739358256 aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,pearson,random,8,0,0.6984411569502376,0.05398723363884652 aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,pearson,random,8,1,0.754828418128203,0.03040022622820331 aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,pearson,random,8,2,0.5655988276473191,0.14396676855997925 aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,pearson,random,8,3,0.9407474980820671,0.000497230334167822 aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,pearson,random,8,4,0.770589245932409,0.025229147116181697 aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,pearson,random,8,0,0.7775815292717585,0.023123063813025962 aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,pearson,random,8,1,0.5611200837416681,0.14787988852194642 aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,pearson,random,8,2,0.34646366697352105,0.40049416986179387 aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,pearson,random,8,3,0.7868643731535557,0.020500867535993103 aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,pearson,random,8,4,0.8114670933196435,0.014473750045325934 aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,pearson,random,8,0,0.4013581254554363,0.32436552572418753 aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,pearson,random,8,1,0.28341806840646894,0.4963625961904983 aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,pearson,random,8,2,0.3139211847524032,0.44892434309679713 aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,pearson,random,8,3,0.2606167560977108,0.5330194398770082 aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,pearson,random,8,4,0.32260154615753545,0.43577896021471924 aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,pearson,random,8,0,0.7827817854375669,0.021629949458519884 aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,pearson,random,8,1,0.9421767369217469,0.0004626159242720608 aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,pearson,random,8,2,0.5386185630062554,0.16841388744478442 aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,pearson,random,8,3,0.7045551126623175,0.05103000019308416 aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,pearson,random,8,4,0.8414540075802577,0.00881618884168942 aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,pearson,random,8,0,0.8748256107732684,0.0044544778532186755 aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,pearson,random,8,1,0.8614522174161048,0.005976999431835443 aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,pearson,random,8,2,0.7878166990611953,0.02024289628983945 aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,pearson,random,8,3,0.8381151096374623,0.009360136935052572 aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,pearson,random,8,4,0.876154278920616,0.0043186280005204514 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,pearson,random,8,0,0.9802952193136,1.884578972104051e-05 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,pearson,random,8,1,0.9841937367574427,9.755845662836177e-06 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,pearson,random,8,2,0.8661864185981796,0.005405102460401999 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,pearson,random,8,3,0.8297856426405835,0.010808669505560614 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,pearson,random,8,4,0.9329487606730291,0.000716243089312378 aggregate,aggregate,MixEval,mixeval_240829.csv,pearson,random,8,0,0.822202489777381,0.01224422861798353 aggregate,aggregate,MixEval,mixeval_240829.csv,pearson,random,8,1,0.6968865871905413,0.05475511707469452 aggregate,aggregate,MixEval,mixeval_240829.csv,pearson,random,8,2,0.9318897100616549,0.0007501099193828288 aggregate,aggregate,MixEval,mixeval_240829.csv,pearson,random,8,3,0.7939152572032528,0.018638835543465734 aggregate,aggregate,MixEval,mixeval_240829.csv,pearson,random,8,4,0.7761614135775217,0.02354161442763604 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,pearson,random,8,0,0.9079242687040253,0.0018192466167481706 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,pearson,random,8,1,0.5935991848770941,0.12081484777974201 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,pearson,random,8,2,0.96841302674998,7.693398893847449e-05 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,pearson,random,8,3,0.9131963004520903,0.001530535130781307 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,pearson,random,8,4,0.7594573765014532,0.02881968270449265 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,pearson,random,8,0,0.6622792441367216,0.07355344210000651 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,pearson,random,8,1,0.5835165093102912,0.1288909419896904 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,pearson,random,8,2,0.7271748558955601,0.04094703171178795 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,pearson,random,8,3,0.7369082697183147,0.0370157216672518 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,pearson,random,8,4,0.7219159720057066,0.04317213020613491 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,pearson,random,8,0,0.8973595810319037,0.002499476856786579 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,pearson,random,8,1,0.6540145328427245,0.07853263145320354 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,pearson,random,8,2,0.9470816844896075,0.0003559262259996983 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,pearson,random,8,3,0.798793471524343,0.017414760604056785 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,pearson,random,8,4,0.766501585020503,0.026513385703318352 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,pearson,random,8,0,0.6776894663079587,0.06477689572321889 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,pearson,random,8,1,0.6576248245381009,0.07633405000799688 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,pearson,random,8,2,0.796342090311639,0.018023378799051942 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,pearson,random,8,3,0.689140856921657,0.058678219175095074 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,pearson,random,8,4,0.6705942614169457,0.06873614015066103 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,pearson,random,8,0,0.6842754194067544,0.0612256583562849 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,pearson,random,8,1,0.7338112096805872,0.03824046140795786 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,pearson,random,8,2,0.8786344078919507,0.0040722405599500165 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,pearson,random,8,3,0.8914863638509409,0.0029400900210167272 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,pearson,random,8,4,0.8522000994286094,0.007203358614415384 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,pearson,random,8,0,0.7479170810940026,0.03285737031031745 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,pearson,random,8,1,0.5899049701184135,0.1237398240474465 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,pearson,random,8,2,0.864013241961245,0.005663050469813282 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,pearson,random,8,3,0.726560560314063,0.04120326937800088 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,pearson,random,8,4,0.7600546147835674,0.02861953111724766 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,pearson,random,8,0,0.8675817638279608,0.00524352512595729 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,pearson,random,8,1,0.4358953069712842,0.280322780055143 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,pearson,random,8,2,0.8724977849323057,0.004699053502733089 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,pearson,random,8,3,0.871502377377448,0.004806214049293794 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,pearson,random,8,4,0.636462032322589,0.08974474991245225 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,pearson,random,8,0,0.7407371067623334,0.035535069908202585 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,pearson,random,8,1,0.13754152986907456,0.7453436298315592 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,pearson,random,8,2,0.8584434869588686,0.006359804257501524 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,pearson,random,8,3,0.9096718109287911,0.0017199423212977748 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,pearson,random,8,4,0.429513562091493,0.2882272134157949 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,pearson,random,8,0,0.7258395762861067,0.04150524782255408 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,pearson,random,8,1,0.4140057077993773,0.3078793667149351 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,pearson,random,8,2,0.8860840192325219,0.003387122941063616 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,pearson,random,8,3,0.8531999374729967,0.007063738601380546 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,pearson,random,8,4,0.570698753672453,0.13958138247636556 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,pearson,random,8,0,0.9462124246513754,0.00037350751375720304 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,pearson,random,8,1,0.820982530302196,0.012485817170678851 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,pearson,random,8,2,0.9284819872198913,0.0008661544234609058 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,pearson,random,8,3,0.9226572389021586,0.0010905865909148318 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,pearson,random,8,4,0.8996834645928126,0.0023377397968761906 aggregate,aggregate,OpenCompass,opencompass_240829.csv,pearson,random,8,0,0.9806889787900566,1.77437080791335e-05 aggregate,aggregate,OpenCompass,opencompass_240829.csv,pearson,random,8,1,0.9467481050448351,0.00036260722071780783 aggregate,aggregate,OpenCompass,opencompass_240829.csv,pearson,random,8,2,0.9051882617143683,0.001982079878231783 aggregate,aggregate,OpenCompass,opencompass_240829.csv,pearson,random,8,3,0.8448816290057799,0.008279149903754354 aggregate,aggregate,OpenCompass,opencompass_240829.csv,pearson,random,8,4,0.9486969514405281,0.0003247187445212263 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,pearson,random,8,0,0.7138885174194392,0.046707103452906885 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,pearson,random,8,1,0.40763933138747765,0.3161269846214854 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,pearson,random,8,2,0.5033557119680766,0.20350786972733814 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,pearson,random,8,3,0.4943676910774294,0.21301612937354739 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,pearson,random,8,4,0.3662549994154035,0.3722134961617391 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,pearson,random,8,0,0.6943274080319848,0.05603338677616118 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,pearson,random,8,1,0.888202282224346,0.0032069637473251308 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,pearson,random,8,2,0.862959786938574,0.0057908774192851585 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,pearson,random,8,3,0.4422315456206938,0.2725814015162671 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,pearson,random,8,4,0.9314197867245828,0.0007654668867563735 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,pearson,random,8,0,0.8576726697477571,0.006460333718352682 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,pearson,random,8,1,0.6477798867796105,0.08241558395766836 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,pearson,random,8,2,0.7105249096891054,0.04823848031855015 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,pearson,random,8,3,0.7433756448219943,0.034536127920169364 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,pearson,random,8,4,0.465629371128827,0.24492880327618063 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,pearson,random,8,0,0.9815968610969954,1.5367458655827867e-05 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,pearson,random,8,1,0.9215279351913577,0.0011380681078154023 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,pearson,random,8,2,0.9028698976709195,0.0021272329705264844 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,pearson,random,8,3,0.8115257987039834,0.014460915122317916 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,pearson,random,8,4,0.8840656907304268,0.003564741739845647 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,pearson,random,8,0,0.9288767434076772,0.0008521494712455959 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,pearson,random,8,1,0.8762491857760322,0.004309027650395265 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,pearson,random,8,2,0.822174167720692,0.012249803466994006 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,pearson,random,8,3,0.8388480886223416,0.009238949980481774 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,pearson,random,8,4,0.9318866818637482,0.0007502082286076188 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,pearson,random,8,0,0.6752208316271633,0.06613869004956173 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,pearson,random,8,1,0.7677373687773497,0.026120973578910495 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,pearson,random,8,2,0.7919204265038193,0.01915443839404165 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,pearson,random,8,3,0.8238198607264919,0.01192852239680578 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,pearson,random,8,4,0.8788769140000767,0.0040486473187813605 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,pearson,random,8,0,0.5937971020205063,0.1206592532108973 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,pearson,random,8,1,0.6743688104667733,0.0666125934693148 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,pearson,random,8,2,0.6092910701405022,0.10882867605607495 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,pearson,random,8,3,0.02436876480189197,0.954326651607438 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,pearson,random,8,4,0.7114255278499215,0.04782552820112736 aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,pearson,random,8,0,0.5887872724291499,0.12463254240428198 aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,pearson,random,8,1,0.4029552549015283,0.32226121873409685 aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,pearson,random,8,2,0.19589220319331574,0.6419903458052949 aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,pearson,random,8,3,0.5147894627560958,0.1917415408232741 aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,pearson,random,8,4,0.43696792691727815,0.2790047957490856 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,pearson,random,8,0,0.9683600812057522,7.731839943750683e-05 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,pearson,random,8,1,0.9490060035318915,0.00031896092810029624 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,pearson,random,8,2,0.9033732116949054,0.0020951534061901173 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,pearson,random,8,3,0.9728319200142996,4.911626350007423e-05 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,pearson,random,8,4,0.9174158952141087,0.0013223130420052574 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,pearson,random,8,0,0.8698029729880158,0.00499276771087744 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,pearson,random,8,1,0.9736499620869766,4.483954353741208e-05 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,pearson,random,8,2,0.8938963574061565,0.002753683842916408 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,pearson,random,8,3,0.9427230009399408,0.00044981624708065733 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,pearson,random,8,4,0.9288091831587435,0.0008545357544848401 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,pearson,random,8,0,0.9876650170257133,4.648675321533348e-06 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,pearson,random,8,1,0.9200698352872445,0.0012013420941124318 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,pearson,random,8,2,0.8197843971795349,0.012725991028944833 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,pearson,random,8,3,0.9667731014329254,8.943826166773405e-05 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,pearson,random,8,4,0.9135236868955329,0.0015136659995374103 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,pearson,random,8,0,0.983826044072315,1.0449743172360012e-05 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,pearson,random,8,1,0.9482689395026054,0.000332805134027447 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,pearson,random,8,2,0.9334433471484072,0.0007007762613840839 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,pearson,random,8,3,0.8998371432675459,0.0023272903802322954 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,pearson,random,8,4,0.9131450099069247,0.0015331889972515346 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,pearson,random,8,0,0.9445409047411082,0.00040889964932544416 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,pearson,random,8,1,0.8996453255999854,0.00234033776853281 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,pearson,random,8,2,0.8662449830102448,0.005398257529969565 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,pearson,random,8,3,0.9506955154682739,0.00028866872380162265 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,pearson,random,8,4,0.9121357775980045,0.0015860194531010332 aggregate,aggregate,LiveBench Language,livebench_240829.csv,pearson,random,8,0,0.9469225816315634,0.000359102582060145 aggregate,aggregate,LiveBench Language,livebench_240829.csv,pearson,random,8,1,0.790872393374341,0.019428850798750914 aggregate,aggregate,LiveBench Language,livebench_240829.csv,pearson,random,8,2,0.7384692720332464,0.03640761031575469 aggregate,aggregate,LiveBench Language,livebench_240829.csv,pearson,random,8,3,0.9396936265489109,0.0005238133760109684 aggregate,aggregate,LiveBench Language,livebench_240829.csv,pearson,random,8,4,0.7853349194194776,0.020919442242219075 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,pearson,random,8,0,0.8636070293544758,0.005712124057773506 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,pearson,random,8,1,0.837126038633602,0.009525258316342535 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,pearson,random,8,2,0.7663953319208139,0.026547294337781743 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,pearson,random,8,3,0.8834569465544357,0.00361946726545403 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,pearson,random,8,4,0.8480938359553485,0.00779520658099071 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,pearson,random,8,0,0.9882164477730901,4.05436289119973e-06 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,pearson,random,8,1,0.9749878899040407,3.838912250625781e-05 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,pearson,random,8,2,0.9189017807616305,0.0012536521795481071 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,pearson,random,8,3,0.976785228034165,3.073554131266073e-05 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,pearson,random,8,4,0.9683736529744773,7.721974100004276e-05 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,pearson,random,8,0,0.8793267175321069,0.004005119722136405 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,pearson,random,8,1,0.8760721346635911,0.004326948446281908 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,pearson,random,8,2,0.9315137258308156,0.0007623806815109492 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,pearson,random,8,3,0.9671655908223616,8.633181797191984e-05 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,pearson,random,8,4,0.7675767218262903,0.026171781192995118 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,pearson,random,8,0,0.8483878251754778,0.007751839541749867 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,pearson,random,8,1,0.9222607240796445,0.0011071076795417618 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,pearson,random,8,2,0.9440994017259922,0.00041860181264251746 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,pearson,random,8,3,0.9640433681068886,0.00011310737614553013 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,pearson,random,8,4,0.692434840005101,0.056990052908859494 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,pearson,random,8,0,0.9942767822652612,4.6665376445687894e-07 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,pearson,random,8,1,0.9682235346488557,7.831565067564543e-05 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,pearson,random,8,2,0.9370054660599566,0.0005958002530390111 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,pearson,random,8,3,0.969420946106877,6.985512173523951e-05 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,pearson,random,8,4,0.9834828472581691,1.1126279772397877e-05 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,pearson,random,8,0,0.9928216304628095,9.197638948465057e-07 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,pearson,random,8,1,0.9611899818187688,0.00014192004448559492 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,pearson,random,8,2,0.9411758308443503,0.0004866843681750784 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,pearson,random,8,3,0.9688368521395198,7.390226580769654e-05 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,pearson,random,8,4,0.9723616916410369,5.16925798887181e-05 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,pearson,random,8,0,0.9766036636486001,3.14580315476573e-05 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,pearson,random,8,1,0.9597878054141521,0.00015769662952759886 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,pearson,random,8,2,0.9404428288332258,0.0005048221249291256 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,pearson,random,8,3,0.9831715348590928,1.176456701375346e-05 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,pearson,random,8,4,0.97187689823272,5.4440740892278444e-05 aggregate,aggregate,WildBench Score,wildbench_240829.csv,pearson,random,8,0,0.9852421877364517,7.946695487913594e-06 aggregate,aggregate,WildBench Score,wildbench_240829.csv,pearson,random,8,1,0.9692179758222269,7.124441373542135e-05 aggregate,aggregate,WildBench Score,wildbench_240829.csv,pearson,random,8,2,0.9412248237761267,0.000485487558057933 aggregate,aggregate,WildBench Score,wildbench_240829.csv,pearson,random,8,3,0.98025276424875,1.8967257174977277e-05 aggregate,aggregate,WildBench Score,wildbench_240829.csv,pearson,random,8,4,0.9846373995357367,8.960181355366343e-06 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,pearson,random,8,0,0.9522400671025366,0.0002626898916961467 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,pearson,random,8,1,0.9732730607216835,4.677795327851405e-05 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,pearson,random,8,2,0.9110081304703664,0.001646433879397326 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,pearson,random,8,3,0.9433518650586681,0.0004353717167521428 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,pearson,random,8,4,0.880586328075459,0.003884834219553849 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.9355663499255871,0.0006368701046576545 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.9499604642147754,0.0003016036750416735 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.7164442699126142,0.04556339297891151 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.5643812833359342,0.14502482192576685 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.4448334653124403,0.269433453257965 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.9020957808919513,0.002177191904645508 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.9140262325400854,0.0014880077902407654 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.6613543728531551,0.07410115498793113 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.4797794956768499,0.2289297958345603 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.49503702005526434,0.21230024172428238 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.8658004484348707,0.005450353400185282 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.9239450258900821,0.0010380421984977164 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.6878185417270377,0.05936418242167244 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.6427492187377651,0.08562857067256696 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.19987101474191585,0.6351028985023905 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.7695981699173929,0.025536900476404875 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.851160886507116,0.00735033097799936 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.7535063061583401,0.030861215825263487 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.26946310602236634,0.5186811891252074 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.5071239778851739,0.19958915881626008 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.845558834843199,0.00817557674320208 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.8223598748455347,0.01221327849153134 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.7520379034546343,0.03137821860478068 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.5986152394502113,0.1169062576526029 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.766509325140422,0.026510916638992615 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.6388656044215879,0.08815791552969902 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.8220592376168137,0.012272442496278822 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.1610992186087647,0.7031245257171708 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.22938177579714764,0.584757473087143 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.16217150942988084,0.7012176634258844 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.8536693780854105,0.0069987855857581984 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.9079591032101378,0.0018172316533511903 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.7448797028215589,0.033974472983626124 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.436470242791583,0.2796159471960331 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.5113717481429286,0.195219904727713 aggregate,aggregate,BFCL,bfcl_240906.csv,pearson,random,8,0,0.8848684214582546,0.0034933971141531536 aggregate,aggregate,BFCL,bfcl_240906.csv,pearson,random,8,1,0.9247518427204778,0.0010059807632682822 aggregate,aggregate,BFCL,bfcl_240906.csv,pearson,random,8,2,0.7024798803756629,0.05202256738347333 aggregate,aggregate,BFCL,bfcl_240906.csv,pearson,random,8,3,0.6111548412929141,0.10745210550108082 aggregate,aggregate,BFCL,bfcl_240906.csv,pearson,random,8,4,0.8864983521119945,0.0033513827582610342 aggregate,aggregate,BIGGEN,biggen_240829.csv,pearson,random,8,0,0.8443252756395498,0.008364861793357709 aggregate,aggregate,BIGGEN,biggen_240829.csv,pearson,random,8,1,0.8199557285303699,0.012691469447090417 aggregate,aggregate,BIGGEN,biggen_240829.csv,pearson,random,8,2,0.6898121736766818,0.05833178396126367 aggregate,aggregate,BIGGEN,biggen_240829.csv,pearson,random,8,3,0.1445400076243653,0.732738456710739 aggregate,aggregate,BIGGEN,biggen_240829.csv,pearson,random,8,4,-0.13444519427677581,0.7509364951619687 aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,pearson,random,8,0,0.9461712339012929,0.00037435448514068834 aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,pearson,random,8,1,0.8543556725359636,0.006904516600543572 aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,pearson,random,8,2,0.7671160990392422,0.026317800283773948 aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,pearson,random,8,3,0.4230508906614041,0.29634091151848907 aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,pearson,random,8,4,0.29492042180464345,0.478252042515081 aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,pearson,random,8,0,0.8192056092552416,0.01284304904344425 aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,pearson,random,8,1,0.8053230426409881,0.015856927546595193 aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,pearson,random,8,2,0.6785867773117831,0.06428605698561919 aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,pearson,random,8,3,0.021028776761034942,0.960582665935811 aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,pearson,random,8,4,-0.25337930013147175,0.5448562000018814 aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,pearson,random,8,0,0.8101772449555595,0.014757563523095152 aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,pearson,random,8,1,0.7844308170919763,0.021169355122089707 aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,pearson,random,8,2,0.6407686957715764,0.08691312009391092 aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,pearson,random,8,3,0.042093006210129874,0.9211687904012325 aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,pearson,random,8,4,-0.2813292229519864,0.4996795026573654 aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,pearson,random,8,0,0.8350456630970934,0.00987857623206292 aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,pearson,random,8,1,0.879311548672376,0.004006582681021272 aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,pearson,random,8,2,0.6951300585252861,0.0556305769370549 aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,pearson,random,8,3,0.30955291195703166,0.4556002793087552 aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,pearson,random,8,4,0.09897629382276267,0.8156278898050575 aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,pearson,random,8,0,0.8313126956210078,0.010533178480029779 aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,pearson,random,8,1,0.8169388413464165,0.01330802664448977 aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,pearson,random,8,2,0.8065284450649773,0.015579295379409611 aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,pearson,random,8,3,0.23722382427262312,0.5716108619128892 aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,pearson,random,8,4,0.026088426326565897,0.9511063910298649 aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,pearson,random,8,0,0.5558829816104426,0.15252894598370506 aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,pearson,random,8,1,0.6390946692796851,0.08800754271923365 aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,pearson,random,8,2,0.24121345447897227,0.5649619826999719 aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,pearson,random,8,3,-0.13262144042688304,0.7542351704927408 aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,pearson,random,8,4,-0.46784288126219703,0.24238975539995447 aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,pearson,random,8,0,0.7467577882406231,0.03328104267130768 aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,pearson,random,8,1,0.7611545287510072,0.028253164658278467 aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,pearson,random,8,2,0.6541774611460981,0.07843262445172178 aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,pearson,random,8,3,0.0830822493170678,0.8449361587214159 aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,pearson,random,8,4,-0.1985934514676979,0.6373119372341151 aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,pearson,random,8,0,0.9103256104990007,0.001683717098370581 aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,pearson,random,8,1,0.8079204807250888,0.015262498588799642 aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,pearson,random,8,2,0.7253154362419392,0.0417256201301186 aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,pearson,random,8,3,0.2776474358858506,0.5055464711128136 aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,pearson,random,8,4,-0.04029159995291984,0.9245349726533298 aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,pearson,random,8,0,0.919432996814919,0.0012296819224052442 aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,pearson,random,8,1,0.87005129824662,0.004965222567299112 aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,pearson,random,8,2,0.9073703100625691,0.001851485138509531 aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,pearson,random,8,3,0.8673887162219034,0.005265692212272121 aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,pearson,random,8,4,0.8916723527123611,0.0029254223429427636 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,pearson,random,8,0,0.9804801069360884,1.832282630082123e-05 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,pearson,random,8,1,0.980051779203359,1.9549343460335766e-05 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,pearson,random,8,2,0.9720131442366731,5.3658869462094946e-05 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,pearson,random,8,3,0.9165887813382055,0.001361572704071016 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,pearson,random,8,4,0.9225103255266087,0.0010966889416837342 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,pearson,random,8,0,0.9292369266176062,0.000839501038985727 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,pearson,random,8,1,0.9505492134066896,0.00029121355501060477 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,pearson,random,8,2,0.9415690777822339,0.00047713248045663163 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,pearson,random,8,3,0.9576750897378552,0.00018358576102437457 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,pearson,random,8,4,0.8850761460392197,0.0034750864462593195 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,pearson,random,8,0,0.9798647388383346,2.0101576768271062e-05 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,pearson,random,8,1,0.9781250835045174,2.5741076148769547e-05 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,pearson,random,8,2,0.9598475365356987,0.00015700207944980397 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,pearson,random,8,3,0.9317002702003969,0.000756276259880365 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,pearson,random,8,4,0.8240635545541923,0.011881405061211926 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,pearson,random,8,0,0.9849433970479835,8.437305784682183e-06 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,pearson,random,8,1,0.9899107226768695,2.548168158279175e-06 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,pearson,random,8,2,0.9645217100316719,0.00010869253777108847 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,pearson,random,8,3,0.9447465624679983,0.00040443116308794275 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,pearson,random,8,4,0.8760879368136391,0.0043253470355424355 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,pearson,random,8,0,0.9469408250476264,0.0003587374254477132 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,pearson,random,8,1,0.9498225876442147,0.000304071618749767 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,pearson,random,8,2,0.9413785598975157,0.0004817446027243596 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,pearson,random,8,3,0.8197292667265523,0.012737111858293043 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,pearson,random,8,4,0.9057861973602506,0.0019457176947306907 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,pearson,random,8,0,0.9413025091864188,0.000483593804288479 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,pearson,random,8,1,0.9083254977326705,0.001796125778484392 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,pearson,random,8,2,0.8626635526406192,0.005827152548807454 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,pearson,random,8,3,0.8043418970652331,0.016085184583393794 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,pearson,random,8,4,0.8946872852632068,0.0026942203148939193 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,pearson,random,8,0,0.9025950086780581,0.002144887259438991 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,pearson,random,8,1,0.7564264003460613,0.02984872863501939 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,pearson,random,8,2,0.9033527343998258,0.002096452391428316 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,pearson,random,8,3,0.8494277893147777,0.0075996673267298715 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,pearson,random,8,4,0.8534145445088147,0.007033997470343221