Spaces:
Running
Running
scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value | |
Holmes,holmes_240829.csv,aggregate,aggregate,pearson,random,8,0,-0.017485869096098686,0.9672206778351959 | |
Holmes,holmes_240829.csv,aggregate,aggregate,pearson,random,8,1,-0.06826285140114943,0.8724042132624071 | |
Holmes,holmes_240829.csv,aggregate,aggregate,pearson,random,8,2,-0.27291992568490936,0.5131179718629255 | |
Holmes,holmes_240829.csv,aggregate,aggregate,pearson,random,8,3,-0.0623085741331382,0.8834734515868299 | |
Holmes,holmes_240829.csv,aggregate,aggregate,pearson,random,8,4,0.11553071904436202,0.7852997192967395 | |
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8743737489954189,0.004501296794893102 | |
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8019858294586086,0.01664169341252048 | |
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.865218326418788,0.005519059390504801 | |
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9324959770534272,0.0007305971150650418 | |
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9578331579912773,0.00018155839890573593 | |
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,-0.30992157835736617,0.4550353006304514 | |
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,-0.48460771469003827,0.2235972811859595 | |
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,-0.1162588388208577,0.78397092283469 | |
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,-0.03180360013624742,0.9404084479868535 | |
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.6310234888301745,0.09339585968843296 | |
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.5719061307929368,0.1385541569597628 | |
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,-0.2953447949582872,0.47758892197811004 | |
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,-0.08547114468780825,0.8405203853999355 | |
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,-0.02680948636066538,0.9497562944796989 | |
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.4016145018471783,0.32402730112296474 | |
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7247956777996108,0.04194484960329344 | |
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.2767660595168839,0.5069548295866992 | |
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.3337223270100439,0.4191769676693079 | |
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.6126891094585267,0.10632638977302632 | |
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8079257463851817,0.015261307993340337 | |
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6647150497002838,0.07212235537894374 | |
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9659235574949907,9.641323857066814e-05 | |
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8593434484023453,0.0062437049978399314 | |
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7704800482268904,0.025262942539415363 | |
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9028773381740962,0.002126756432137772 | |
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.748982925973149,0.032470780295939985 | |
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8599957450436625,0.006160409391629476 | |
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8718735582848011,0.004766072993988772 | |
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9069576656171551,0.001875739334441522 | |
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9502933219669614,0.00029570003340264575 | |
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8905328662549648,0.003016032865892646 | |
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.5058552901713423,0.20090402274559316 | |
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6767432630833718,0.0652968761285632 | |
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7135518769682414,0.04685902831102101 | |
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.873661116609048,0.004575776138454243 | |
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8775217778627072,0.004181622363896538 | |
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7683490298001087,0.025928082489068475 | |
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.741463148953373,0.035258455741147623 | |
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7891209052525207,0.019892902878583873 | |
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8157900850650412,0.013547661219765379 | |
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8625206786227912,0.005844699973375535 | |
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.49625129009057833,0.211004712621783 | |
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7482300147416783,0.0327435760119495 | |
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9237060456412569,0.0010476652712265917 | |
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8540419074377281,0.00694751386877189 | |
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7827735900001105,0.021632253958226707 | |
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7416615606437577,0.03518309274676423 | |
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8208959354305796,0.01250307893717913 | |
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9182336628416601,0.0012842298120423852 | |
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9199026021249039,0.0012087423991030853 | |
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7807842071724994,0.022196180227557687 | |
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6016089012086534,0.11460809097860054 | |
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,2,0.85978308688271,0.006187486327563118 | |
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9471155608874564,0.00035525230596496123 | |
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9238574615349179,0.0010415614421426264 | |
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.780599537830846,0.022248986205867058 | |
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.753379355065838,0.030905705190702806 | |
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8379676352721162,0.009384640911630616 | |
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8462209992405952,0.008075105621350536 | |
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9020771423654268,0.0021784040615750178 | |
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9392379026634557,0.000535591367028614 | |
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7412355057774336,0.035345043191044964 | |
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8195179387247324,0.01277979740900836 | |
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9675915145186947,8.304238414993675e-05 | |
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9356246311290696,0.0006351718939850358 | |
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7466011946729814,0.03333852605723143 | |
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9551682330569339,0.00021776057653192886 | |
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.46353588273705637,0.24734250900688215 | |
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8866352243352398,0.003339629955133934 | |
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.937902652612242,0.0005710971446370687 | |
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.2831911510498836,0.4967225093410736 | |
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.2031844122583542,0.6293846722461313 | |
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8894964926830444,0.0031000020401251533 | |
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.549284007260608,0.15849945140105312 | |
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7889373199563972,0.01994193933246426 | |
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9415411104598773,0.00047780769988844555 | |
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8981158348442198,0.0024460728519243077 | |
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7526431927239958,0.0311644661156264 | |
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8962925022649735,0.0025761063553240114 | |
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.937590300147702,0.0005796196796032962 | |
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.5831241321997315,0.12921116102954364 | |
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.5561145441014004,0.1523217142123119 | |
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.5664450708720614,0.14323389729888122 | |
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.47517181530974595,0.23407895750101468 | |
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.718855715365913,0.04449992445427745 | |
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7168604276016974,0.04537877960385103 | |
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,1,0.18264726732113173,0.6650765454064547 | |
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,2,0.04614314940391431,0.9136043258512831 | |
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,3,0.6369093478690498,0.08944819108801377 | |
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8650362997962656,0.005540656777637369 | |
OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9481614738377944,0.00033485605767966255 | |
OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8579024362848122,0.006430262194723998 | |
OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9674751054383679,8.39330376548511e-05 | |
OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9711920829273566,5.848502027941985e-05 | |
OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9121630061872308,0.0015845787994022296 | |
OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7303458809128464,0.03963972108447683 | |
OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7466964409211542,0.03330355520543848 | |
OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8886798251454765,0.0031672235640011434 | |
OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9036719475219376,0.002076262347775526 | |
OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7623592248502944,0.02785522986224059 | |
OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8902509919824877,0.0030387234498153886 | |
OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8349964637145074,0.009887030967730168 | |
OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9513669166922365,0.00027717775621958416 | |
OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.979588964641596,2.0934517813580252e-05 | |
OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8898917220751776,0.0030678038612609354 | |
OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8000397965603336,0.01711033114623395 | |
OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7666453684194998,0.026467542617941944 | |
OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8751438663188438,0.004421691058140597 | |
OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8954496186826447,0.0026376993343606783 | |
OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8195357136433342,0.012776203631959988 | |
OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8973997559676354,0.0024966210305528294 | |
OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9238541898435834,0.0010416930833947954 | |
OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9232578806881373,0.0010658683179569461 | |
OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9466806411756816,0.00036396834317210526 | |
OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9402048459613361,0.0005108048313780666 | |
OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7428545649568395,0.03473202812850355 | |
OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8962239297969814,0.0025810820467571426 | |
OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9528032040825007,0.0002536158007562822 | |
OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8136140570811612,0.01400900062666989 | |
OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.5749045753814719,0.13602130778385005 | |
OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.780595487125304,0.022250145374352125 | |
OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8389921086523722,0.009215256295109017 | |
OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8803463320171083,0.003907570379771439 | |
OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7142670311425445,0.04653663665491792 | |
OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7977979460712193,0.017660348313797546 | |
OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7240026280446691,0.04228069432019545 | |
OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8051290094703403,0.01590190576987268 | |
OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9226246952938778,0.0010919364406592675 | |
OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.770582228125362,0.025231318204288148 | |
OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.5188109005585113,0.18769119165787862 | |
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9642212364414142,0.00011145218096014672 | |
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7836454491081474,0.021387948565361206 | |
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,pearson,random,8,2,0.865235745718993,0.005516995432107779 | |
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,pearson,random,8,3,0.819500116935474,0.012783401302719894 | |
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7432637726714306,0.034578129186903464 | |
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9522400671025366,0.0002626898916961467 | |
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9732730607216835,4.677795327851405e-05 | |
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9110081304703664,0.001646433879397326 | |
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9433518650586681,0.0004353717167521428 | |
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,pearson,random,8,4,0.880586328075459,0.003884834219553849 | |
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9603201312455674,0.00015157780411521223 | |
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9604114108423772,0.00015054459028416203 | |
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9519258192529104,0.00026784516618954716 | |
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9352773832366816,0.0006453340323628832 | |
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9756845534259928,3.5288470321501036e-05 | |
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6730282904268812,0.06736225845470355 | |
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9361725603565639,0.0006193510978979659 | |
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8618105831276622,0.005932414266978994 | |
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9371490197710903,0.0005918014940797798 | |
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8046621876144952,0.01601044603512172 | |
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.41770329390345684,0.30313696659492734 | |
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6529975286213465,0.07915856325659755 | |
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6950517775314824,0.05566978580633573 | |
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.5130382972054114,0.19351964488420637 | |
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.6825577913683614,0.062140382561143265 | |
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9059635004669196,0.0019350193188838174 | |
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8702987510549938,0.00493787146977232 | |
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8349295032906534,0.009898545248446817 | |
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8245663895988613,0.011784555837564846 | |
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9186996315597573,0.0012628532368153516 | |
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.899783088468177,0.002330962388754791 | |
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8724919719311256,0.004699674798249593 | |
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9486250828884353,0.00032606741963897914 | |
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9264530754805538,0.0009405124032405977 | |
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.926933634016331,0.000922537739358256 | |
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6984411569502376,0.05398723363884652 | |
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.754828418128203,0.03040022622820331 | |
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.5655988276473191,0.14396676855997925 | |
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9407474980820671,0.000497230334167822 | |
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.770589245932409,0.025229147116181697 | |
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7775815292717585,0.023123063813025962 | |
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.5611200837416681,0.14787988852194642 | |
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.34646366697352105,0.40049416986179387 | |
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7868643731535557,0.020500867535993103 | |
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8114670933196435,0.014473750045325934 | |
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.4013581254554363,0.32436552572418753 | |
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.28341806840646894,0.4963625961904983 | |
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.3139211847524032,0.44892434309679713 | |
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.2606167560977108,0.5330194398770082 | |
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.32260154615753545,0.43577896021471924 | |
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7827817854375669,0.021629949458519884 | |
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9421767369217469,0.0004626159242720608 | |
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.5386185630062554,0.16841388744478442 | |
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7045551126623175,0.05103000019308416 | |
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8414540075802577,0.00881618884168942 | |
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8748256107732684,0.0044544778532186755 | |
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8614522174161048,0.005976999431835443 | |
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7878166990611953,0.02024289628983945 | |
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8381151096374623,0.009360136935052572 | |
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,pearson,random,8,4,0.876154278920616,0.0043186280005204514 | |
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9802952193136,1.884578972104051e-05 | |
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9841937367574427,9.755845662836177e-06 | |
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8661864185981796,0.005405102460401999 | |
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8297856426405835,0.010808669505560614 | |
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9329487606730291,0.000716243089312378 | |
MixEval,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.822202489777381,0.01224422861798353 | |
MixEval,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6968865871905413,0.05475511707469452 | |
MixEval,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9318897100616549,0.0007501099193828288 | |
MixEval,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7939152572032528,0.018638835543465734 | |
MixEval,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7761614135775217,0.02354161442763604 | |
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9079242687040253,0.0018192466167481706 | |
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.5935991848770941,0.12081484777974201 | |
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.96841302674998,7.693398893847449e-05 | |
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9131963004520903,0.001530535130781307 | |
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7594573765014532,0.02881968270449265 | |
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6622792441367216,0.07355344210000651 | |
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.5835165093102912,0.1288909419896904 | |
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7271748558955601,0.04094703171178795 | |
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7369082697183147,0.0370157216672518 | |
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7219159720057066,0.04317213020613491 | |
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8973595810319037,0.002499476856786579 | |
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6540145328427245,0.07853263145320354 | |
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9470816844896075,0.0003559262259996983 | |
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.798793471524343,0.017414760604056785 | |
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.766501585020503,0.026513385703318352 | |
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6776894663079587,0.06477689572321889 | |
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6576248245381009,0.07633405000799688 | |
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.796342090311639,0.018023378799051942 | |
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.689140856921657,0.058678219175095074 | |
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.6705942614169457,0.06873614015066103 | |
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6842754194067544,0.0612256583562849 | |
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7338112096805872,0.03824046140795786 | |
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8786344078919507,0.0040722405599500165 | |
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8914863638509409,0.0029400900210167272 | |
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8522000994286094,0.007203358614415384 | |
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7479170810940026,0.03285737031031745 | |
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.5899049701184135,0.1237398240474465 | |
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.864013241961245,0.005663050469813282 | |
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.726560560314063,0.04120326937800088 | |
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7600546147835674,0.02861953111724766 | |
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8675817638279608,0.00524352512595729 | |
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.4358953069712842,0.280322780055143 | |
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8724977849323057,0.004699053502733089 | |
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.871502377377448,0.004806214049293794 | |
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.636462032322589,0.08974474991245225 | |
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7407371067623334,0.035535069908202585 | |
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.13754152986907456,0.7453436298315592 | |
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8584434869588686,0.006359804257501524 | |
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9096718109287911,0.0017199423212977748 | |
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.429513562091493,0.2882272134157949 | |
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7258395762861067,0.04150524782255408 | |
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,1,0.4140057077993773,0.3078793667149351 | |
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8860840192325219,0.003387122941063616 | |
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8531999374729967,0.007063738601380546 | |
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,pearson,random,8,4,0.570698753672453,0.13958138247636556 | |
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9462124246513754,0.00037350751375720304 | |
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,pearson,random,8,1,0.820982530302196,0.012485817170678851 | |
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9284819872198913,0.0008661544234609058 | |
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9226572389021586,0.0010905865909148318 | |
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8996834645928126,0.0023377397968761906 | |
OpenCompass,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9806889787900566,1.77437080791335e-05 | |
OpenCompass,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9467481050448351,0.00036260722071780783 | |
OpenCompass,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9051882617143683,0.001982079878231783 | |
OpenCompass,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8448816290057799,0.008279149903754354 | |
OpenCompass,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9486969514405281,0.0003247187445212263 | |
OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7138885174194392,0.046707103452906885 | |
OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.40763933138747765,0.3161269846214854 | |
OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.5033557119680766,0.20350786972733814 | |
OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.4943676910774294,0.21301612937354739 | |
OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.3662549994154035,0.3722134961617391 | |
OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6943274080319848,0.05603338677616118 | |
OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.888202282224346,0.0032069637473251308 | |
OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.862959786938574,0.0057908774192851585 | |
OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.4422315456206938,0.2725814015162671 | |
OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9314197867245828,0.0007654668867563735 | |
OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8576726697477571,0.006460333718352682 | |
OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6477798867796105,0.08241558395766836 | |
OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7105249096891054,0.04823848031855015 | |
OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.7433756448219943,0.034536127920169364 | |
OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.465629371128827,0.24492880327618063 | |
OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9815968610969954,1.5367458655827867e-05 | |
OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9215279351913577,0.0011380681078154023 | |
OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9028698976709195,0.0021272329705264844 | |
OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8115257987039834,0.014460915122317916 | |
OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8840656907304268,0.003564741739845647 | |
OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9288767434076772,0.0008521494712455959 | |
OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8762491857760322,0.004309027650395265 | |
OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.822174167720692,0.012249803466994006 | |
OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8388480886223416,0.009238949980481774 | |
OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9318866818637482,0.0007502082286076188 | |
OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.6752208316271633,0.06613869004956173 | |
OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7677373687773497,0.026120973578910495 | |
OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7919204265038193,0.01915443839404165 | |
OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8238198607264919,0.01192852239680578 | |
OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8788769140000767,0.0040486473187813605 | |
OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,0,0.5937971020205063,0.1206592532108973 | |
OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6743688104667733,0.0666125934693148 | |
OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6092910701405022,0.10882867605607495 | |
OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,3,0.02436876480189197,0.954326651607438 | |
OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7114255278499215,0.04782552820112736 | |
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,pearson,random,8,0,0.5887872724291499,0.12463254240428198 | |
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,pearson,random,8,1,0.4029552549015283,0.32226121873409685 | |
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,pearson,random,8,2,0.19589220319331574,0.6419903458052949 | |
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,pearson,random,8,3,0.5147894627560958,0.1917415408232741 | |
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,pearson,random,8,4,0.43696792691727815,0.2790047957490856 | |
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9683600812057522,7.731839943750683e-05 | |
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9490060035318915,0.00031896092810029624 | |
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9033732116949054,0.0020951534061901173 | |
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9728319200142996,4.911626350007423e-05 | |
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9174158952141087,0.0013223130420052574 | |
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8698029729880158,0.00499276771087744 | |
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9736499620869766,4.483954353741208e-05 | |
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8938963574061565,0.002753683842916408 | |
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9427230009399408,0.00044981624708065733 | |
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9288091831587435,0.0008545357544848401 | |
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9876650170257133,4.648675321533348e-06 | |
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9200698352872445,0.0012013420941124318 | |
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8197843971795349,0.012725991028944833 | |
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9667731014329254,8.943826166773405e-05 | |
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9135236868955329,0.0015136659995374103 | |
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.983826044072315,1.0449743172360012e-05 | |
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9482689395026054,0.000332805134027447 | |
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9334433471484072,0.0007007762613840839 | |
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8998371432675459,0.0023272903802322954 | |
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9131450099069247,0.0015331889972515346 | |
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9445409047411082,0.00040889964932544416 | |
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8996453255999854,0.00234033776853281 | |
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8662449830102448,0.005398257529969565 | |
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9506955154682739,0.00028866872380162265 | |
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9121357775980045,0.0015860194531010332 | |
LiveBench Language,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9469225816315634,0.000359102582060145 | |
LiveBench Language,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.790872393374341,0.019428850798750914 | |
LiveBench Language,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7384692720332464,0.03640761031575469 | |
LiveBench Language,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9396936265489109,0.0005238133760109684 | |
LiveBench Language,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7853349194194776,0.020919442242219075 | |
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8636070293544758,0.005712124057773506 | |
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.837126038633602,0.009525258316342535 | |
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7663953319208139,0.026547294337781743 | |
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8834569465544357,0.00361946726545403 | |
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8480938359553485,0.00779520658099071 | |
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9882164477730901,4.05436289119973e-06 | |
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9749878899040407,3.838912250625781e-05 | |
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9189017807616305,0.0012536521795481071 | |
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.976785228034165,3.073554131266073e-05 | |
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9683736529744773,7.721974100004276e-05 | |
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8793267175321069,0.004005119722136405 | |
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8760721346635911,0.004326948446281908 | |
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9315137258308156,0.0007623806815109492 | |
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9671655908223616,8.633181797191984e-05 | |
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.7675767218262903,0.026171781192995118 | |
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8483878251754778,0.007751839541749867 | |
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9222607240796445,0.0011071076795417618 | |
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9440994017259922,0.00041860181264251746 | |
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9640433681068886,0.00011310737614553013 | |
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.692434840005101,0.056990052908859494 | |
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9942767822652612,4.6665376445687894e-07 | |
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9682235346488557,7.831565067564543e-05 | |
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9370054660599566,0.0005958002530390111 | |
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.969420946106877,6.985512173523951e-05 | |
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9834828472581691,1.1126279772397877e-05 | |
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9928216304628095,9.197638948465057e-07 | |
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9611899818187688,0.00014192004448559492 | |
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9411758308443503,0.0004866843681750784 | |
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9688368521395198,7.390226580769654e-05 | |
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9723616916410369,5.16925798887181e-05 | |
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9766036636486001,3.14580315476573e-05 | |
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9597878054141521,0.00015769662952759886 | |
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9404428288332258,0.0005048221249291256 | |
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9831715348590928,1.176456701375346e-05 | |
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.97187689823272,5.4440740892278444e-05 | |
WildBench Score,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9852421877364517,7.946695487913594e-06 | |
WildBench Score,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9692179758222269,7.124441373542135e-05 | |
WildBench Score,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9412248237761267,0.000485487558057933 | |
WildBench Score,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,3,0.98025276424875,1.8967257174977277e-05 | |
WildBench Score,wildbench_240829.csv,aggregate,aggregate,pearson,random,8,4,0.9846373995357367,8.960181355366343e-06 | |
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9522400671025366,0.0002626898916961467 | |
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,pearson,random,8,1,0.9732730607216835,4.677795327851405e-05 | |
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9110081304703664,0.001646433879397326 | |
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,pearson,random,8,3,0.9433518650586681,0.0004353717167521428 | |
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,pearson,random,8,4,0.880586328075459,0.003884834219553849 | |
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.9355663499255871,0.0006368701046576545 | |
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.9499604642147754,0.0003016036750416735 | |
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.7164442699126142,0.04556339297891151 | |
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.5643812833359342,0.14502482192576685 | |
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.4448334653124403,0.269433453257965 | |
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.9020957808919513,0.002177191904645508 | |
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.9140262325400854,0.0014880077902407654 | |
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.6613543728531551,0.07410115498793113 | |
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.4797794956768499,0.2289297958345603 | |
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.49503702005526434,0.21230024172428238 | |
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.8658004484348707,0.005450353400185282 | |
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.9239450258900821,0.0010380421984977164 | |
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.6878185417270377,0.05936418242167244 | |
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.6427492187377651,0.08562857067256696 | |
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.19987101474191585,0.6351028985023905 | |
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.7695981699173929,0.025536900476404875 | |
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.851160886507116,0.00735033097799936 | |
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.7535063061583401,0.030861215825263487 | |
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.26946310602236634,0.5186811891252074 | |
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.5071239778851739,0.19958915881626008 | |
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.845558834843199,0.00817557674320208 | |
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.8223598748455347,0.01221327849153134 | |
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.7520379034546343,0.03137821860478068 | |
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.5986152394502113,0.1169062576526029 | |
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.766509325140422,0.026510916638992615 | |
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.6388656044215879,0.08815791552969902 | |
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.8220592376168137,0.012272442496278822 | |
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.1610992186087647,0.7031245257171708 | |
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.22938177579714764,0.584757473087143 | |
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.16217150942988084,0.7012176634258844 | |
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,0,0.8536693780854105,0.0069987855857581984 | |
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,1,0.9079591032101378,0.0018172316533511903 | |
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,2,0.7448797028215589,0.033974472983626124 | |
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,3,0.436470242791583,0.2796159471960331 | |
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,pearson,random,8,4,0.5113717481429286,0.195219904727713 | |
BFCL,bfcl_240906.csv,aggregate,aggregate,pearson,random,8,0,0.8848684214582546,0.0034933971141531536 | |
BFCL,bfcl_240906.csv,aggregate,aggregate,pearson,random,8,1,0.9247518427204778,0.0010059807632682822 | |
BFCL,bfcl_240906.csv,aggregate,aggregate,pearson,random,8,2,0.7024798803756629,0.05202256738347333 | |
BFCL,bfcl_240906.csv,aggregate,aggregate,pearson,random,8,3,0.6111548412929141,0.10745210550108082 | |
BFCL,bfcl_240906.csv,aggregate,aggregate,pearson,random,8,4,0.8864983521119945,0.0033513827582610342 | |
BIGGEN,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8443252756395498,0.008364861793357709 | |
BIGGEN,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8199557285303699,0.012691469447090417 | |
BIGGEN,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6898121736766818,0.05833178396126367 | |
BIGGEN,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.1445400076243653,0.732738456710739 | |
BIGGEN,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.13444519427677581,0.7509364951619687 | |
BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9461712339012929,0.00037435448514068834 | |
BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8543556725359636,0.006904516600543572 | |
BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7671160990392422,0.026317800283773948 | |
BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.4230508906614041,0.29634091151848907 | |
BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,0.29492042180464345,0.478252042515081 | |
BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8192056092552416,0.01284304904344425 | |
BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8053230426409881,0.015856927546595193 | |
BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6785867773117831,0.06428605698561919 | |
BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.021028776761034942,0.960582665935811 | |
BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.25337930013147175,0.5448562000018814 | |
BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8101772449555595,0.014757563523095152 | |
BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7844308170919763,0.021169355122089707 | |
BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6407686957715764,0.08691312009391092 | |
BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.042093006210129874,0.9211687904012325 | |
BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.2813292229519864,0.4996795026573654 | |
BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8350456630970934,0.00987857623206292 | |
BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.879311548672376,0.004006582681021272 | |
BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6951300585252861,0.0556305769370549 | |
BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.30955291195703166,0.4556002793087552 | |
BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,0.09897629382276267,0.8156278898050575 | |
BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.8313126956210078,0.010533178480029779 | |
BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8169388413464165,0.01330802664448977 | |
BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.8065284450649773,0.015579295379409611 | |
BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.23722382427262312,0.5716108619128892 | |
BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,0.026088426326565897,0.9511063910298649 | |
BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.5558829816104426,0.15252894598370506 | |
BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.6390946692796851,0.08800754271923365 | |
BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.24121345447897227,0.5649619826999719 | |
BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,-0.13262144042688304,0.7542351704927408 | |
BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.46784288126219703,0.24238975539995447 | |
BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.7467577882406231,0.03328104267130768 | |
BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.7611545287510072,0.028253164658278467 | |
BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.6541774611460981,0.07843262445172178 | |
BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.0830822493170678,0.8449361587214159 | |
BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.1985934514676979,0.6373119372341151 | |
BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.9103256104990007,0.001683717098370581 | |
BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.8079204807250888,0.015262498588799642 | |
BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.7253154362419392,0.0417256201301186 | |
BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.2776474358858506,0.5055464711128136 | |
BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,-0.04029159995291984,0.9245349726533298 | |
BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,pearson,random,8,0,0.919432996814919,0.0012296819224052442 | |
BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,pearson,random,8,1,0.87005129824662,0.004965222567299112 | |
BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,pearson,random,8,2,0.9073703100625691,0.001851485138509531 | |
BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,pearson,random,8,3,0.8673887162219034,0.005265692212272121 | |
BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,pearson,random,8,4,0.8916723527123611,0.0029254223429427636 | |
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9804801069360884,1.832282630082123e-05 | |
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.980051779203359,1.9549343460335766e-05 | |
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.9720131442366731,5.3658869462094946e-05 | |
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.9165887813382055,0.001361572704071016 | |
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.9225103255266087,0.0010966889416837342 | |
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9292369266176062,0.000839501038985727 | |
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.9505492134066896,0.00029121355501060477 | |
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.9415690777822339,0.00047713248045663163 | |
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.9576750897378552,0.00018358576102437457 | |
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.8850761460392197,0.0034750864462593195 | |
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9798647388383346,2.0101576768271062e-05 | |
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.9781250835045174,2.5741076148769547e-05 | |
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.9598475365356987,0.00015700207944980397 | |
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.9317002702003969,0.000756276259880365 | |
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.8240635545541923,0.011881405061211926 | |
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9849433970479835,8.437305784682183e-06 | |
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.9899107226768695,2.548168158279175e-06 | |
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.9645217100316719,0.00010869253777108847 | |
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.9447465624679983,0.00040443116308794275 | |
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.8760879368136391,0.0043253470355424355 | |
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9469408250476264,0.0003587374254477132 | |
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.9498225876442147,0.000304071618749767 | |
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.9413785598975157,0.0004817446027243596 | |
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.8197292667265523,0.012737111858293043 | |
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.9057861973602506,0.0019457176947306907 | |
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9413025091864188,0.000483593804288479 | |
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.9083254977326705,0.001796125778484392 | |
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.8626635526406192,0.005827152548807454 | |
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.8043418970652331,0.016085184583393794 | |
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.8946872852632068,0.0026942203148939193 | |
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,0,0.9025950086780581,0.002144887259438991 | |
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,1,0.7564264003460613,0.02984872863501939 | |
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,2,0.9033527343998258,0.002096452391428316 | |
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,3,0.8494277893147777,0.0075996673267298715 | |
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,pearson,random,8,4,0.8534145445088147,0.007033997470343221 | |
aggregate,aggregate,Holmes,holmes_240829.csv,pearson,random,8,0,-0.017485869096098686,0.9672206778351959 | |
aggregate,aggregate,Holmes,holmes_240829.csv,pearson,random,8,1,-0.06826285140114943,0.8724042132624071 | |
aggregate,aggregate,Holmes,holmes_240829.csv,pearson,random,8,2,-0.27291992568490936,0.5131179718629255 | |
aggregate,aggregate,Holmes,holmes_240829.csv,pearson,random,8,3,-0.0623085741331382,0.8834734515868299 | |
aggregate,aggregate,Holmes,holmes_240829.csv,pearson,random,8,4,0.11553071904436202,0.7852997192967395 | |
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,pearson,random,8,0,0.8743737489954189,0.004501296794893102 | |
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,pearson,random,8,1,0.8019858294586086,0.01664169341252048 | |
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,pearson,random,8,2,0.865218326418788,0.005519059390504801 | |
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,pearson,random,8,3,0.9324959770534272,0.0007305971150650418 | |
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,pearson,random,8,4,0.9578331579912773,0.00018155839890573593 | |
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,pearson,random,8,0,-0.30992157835736617,0.4550353006304514 | |
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,pearson,random,8,1,-0.48460771469003827,0.2235972811859595 | |
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,pearson,random,8,2,-0.1162588388208577,0.78397092283469 | |
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,pearson,random,8,3,-0.03180360013624742,0.9404084479868535 | |
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,pearson,random,8,4,-0.6310234888301745,0.09339585968843296 | |
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,pearson,random,8,0,0.5719061307929368,0.1385541569597628 | |
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,pearson,random,8,1,-0.2953447949582872,0.47758892197811004 | |
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,pearson,random,8,2,-0.08547114468780825,0.8405203853999355 | |
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,pearson,random,8,3,-0.02680948636066538,0.9497562944796989 | |
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,pearson,random,8,4,-0.4016145018471783,0.32402730112296474 | |
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,pearson,random,8,0,0.7247956777996108,0.04194484960329344 | |
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,pearson,random,8,1,0.2767660595168839,0.5069548295866992 | |
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,pearson,random,8,2,0.3337223270100439,0.4191769676693079 | |
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,pearson,random,8,3,0.6126891094585267,0.10632638977302632 | |
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,pearson,random,8,4,0.8079257463851817,0.015261307993340337 | |
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,pearson,random,8,0,0.6647150497002838,0.07212235537894374 | |
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,pearson,random,8,1,0.9659235574949907,9.641323857066814e-05 | |
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,pearson,random,8,2,0.8593434484023453,0.0062437049978399314 | |
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,pearson,random,8,3,0.7704800482268904,0.025262942539415363 | |
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,pearson,random,8,4,0.9028773381740962,0.002126756432137772 | |
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,pearson,random,8,0,0.748982925973149,0.032470780295939985 | |
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,pearson,random,8,1,0.8599957450436625,0.006160409391629476 | |
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,pearson,random,8,2,0.8718735582848011,0.004766072993988772 | |
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,pearson,random,8,3,0.9069576656171551,0.001875739334441522 | |
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,pearson,random,8,4,0.9502933219669614,0.00029570003340264575 | |
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,pearson,random,8,0,0.8905328662549648,0.003016032865892646 | |
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,pearson,random,8,1,0.5058552901713423,0.20090402274559316 | |
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,pearson,random,8,2,0.6767432630833718,0.0652968761285632 | |
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,pearson,random,8,3,0.7135518769682414,0.04685902831102101 | |
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,pearson,random,8,4,0.873661116609048,0.004575776138454243 | |
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,pearson,random,8,0,0.8775217778627072,0.004181622363896538 | |
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,pearson,random,8,1,0.7683490298001087,0.025928082489068475 | |
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,pearson,random,8,2,0.741463148953373,0.035258455741147623 | |
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,pearson,random,8,3,0.7891209052525207,0.019892902878583873 | |
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,pearson,random,8,4,0.8157900850650412,0.013547661219765379 | |
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,pearson,random,8,0,0.8625206786227912,0.005844699973375535 | |
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,pearson,random,8,1,0.49625129009057833,0.211004712621783 | |
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,pearson,random,8,2,0.7482300147416783,0.0327435760119495 | |
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,pearson,random,8,3,0.9237060456412569,0.0010476652712265917 | |
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,pearson,random,8,4,0.8540419074377281,0.00694751386877189 | |
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,pearson,random,8,0,0.7827735900001105,0.021632253958226707 | |
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,pearson,random,8,1,0.7416615606437577,0.03518309274676423 | |
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,pearson,random,8,2,0.8208959354305796,0.01250307893717913 | |
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,pearson,random,8,3,0.9182336628416601,0.0012842298120423852 | |
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,pearson,random,8,4,0.9199026021249039,0.0012087423991030853 | |
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,pearson,random,8,0,0.7807842071724994,0.022196180227557687 | |
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,pearson,random,8,1,0.6016089012086534,0.11460809097860054 | |
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,pearson,random,8,2,0.85978308688271,0.006187486327563118 | |
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,pearson,random,8,3,0.9471155608874564,0.00035525230596496123 | |
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,pearson,random,8,4,0.9238574615349179,0.0010415614421426264 | |
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.780599537830846,0.022248986205867058 | |
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.753379355065838,0.030905705190702806 | |
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.8379676352721162,0.009384640911630616 | |
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.8462209992405952,0.008075105621350536 | |
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.9020771423654268,0.0021784040615750178 | |
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.9392379026634557,0.000535591367028614 | |
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.7412355057774336,0.035345043191044964 | |
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.8195179387247324,0.01277979740900836 | |
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.9675915145186947,8.304238414993675e-05 | |
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.9356246311290696,0.0006351718939850358 | |
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.7466011946729814,0.03333852605723143 | |
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.9551682330569339,0.00021776057653192886 | |
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.46353588273705637,0.24734250900688215 | |
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.8866352243352398,0.003339629955133934 | |
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.937902652612242,0.0005710971446370687 | |
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.2831911510498836,0.4967225093410736 | |
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.2031844122583542,0.6293846722461313 | |
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.8894964926830444,0.0031000020401251533 | |
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.549284007260608,0.15849945140105312 | |
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.7889373199563972,0.01994193933246426 | |
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.9415411104598773,0.00047780769988844555 | |
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.8981158348442198,0.0024460728519243077 | |
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.7526431927239958,0.0311644661156264 | |
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.8962925022649735,0.0025761063553240114 | |
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.937590300147702,0.0005796196796032962 | |
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.5831241321997315,0.12921116102954364 | |
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.5561145441014004,0.1523217142123119 | |
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.5664450708720614,0.14323389729888122 | |
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.47517181530974595,0.23407895750101468 | |
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.718855715365913,0.04449992445427745 | |
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,pearson,random,8,0,0.7168604276016974,0.04537877960385103 | |
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,pearson,random,8,1,0.18264726732113173,0.6650765454064547 | |
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,pearson,random,8,2,0.04614314940391431,0.9136043258512831 | |
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,pearson,random,8,3,0.6369093478690498,0.08944819108801377 | |
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,pearson,random,8,4,0.8650362997962656,0.005540656777637369 | |
aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,pearson,random,8,0,0.9481614738377944,0.00033485605767966255 | |
aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,pearson,random,8,1,0.8579024362848122,0.006430262194723998 | |
aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,pearson,random,8,2,0.9674751054383679,8.39330376548511e-05 | |
aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,pearson,random,8,3,0.9711920829273566,5.848502027941985e-05 | |
aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,pearson,random,8,4,0.9121630061872308,0.0015845787994022296 | |
aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,pearson,random,8,0,0.7303458809128464,0.03963972108447683 | |
aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,pearson,random,8,1,0.7466964409211542,0.03330355520543848 | |
aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,pearson,random,8,2,0.8886798251454765,0.0031672235640011434 | |
aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,pearson,random,8,3,0.9036719475219376,0.002076262347775526 | |
aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,pearson,random,8,4,0.7623592248502944,0.02785522986224059 | |
aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,pearson,random,8,0,0.8902509919824877,0.0030387234498153886 | |
aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,pearson,random,8,1,0.8349964637145074,0.009887030967730168 | |
aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,pearson,random,8,2,0.9513669166922365,0.00027717775621958416 | |
aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,pearson,random,8,3,0.979588964641596,2.0934517813580252e-05 | |
aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,pearson,random,8,4,0.8898917220751776,0.0030678038612609354 | |
aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,pearson,random,8,0,0.8000397965603336,0.01711033114623395 | |
aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,pearson,random,8,1,0.7666453684194998,0.026467542617941944 | |
aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,pearson,random,8,2,0.8751438663188438,0.004421691058140597 | |
aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,pearson,random,8,3,0.8954496186826447,0.0026376993343606783 | |
aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,pearson,random,8,4,0.8195357136433342,0.012776203631959988 | |
aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,pearson,random,8,0,0.8973997559676354,0.0024966210305528294 | |
aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,pearson,random,8,1,0.9238541898435834,0.0010416930833947954 | |
aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,pearson,random,8,2,0.9232578806881373,0.0010658683179569461 | |
aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,pearson,random,8,3,0.9466806411756816,0.00036396834317210526 | |
aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,pearson,random,8,4,0.9402048459613361,0.0005108048313780666 | |
aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,pearson,random,8,0,0.7428545649568395,0.03473202812850355 | |
aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,pearson,random,8,1,0.8962239297969814,0.0025810820467571426 | |
aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,pearson,random,8,2,0.9528032040825007,0.0002536158007562822 | |
aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,pearson,random,8,3,0.8136140570811612,0.01400900062666989 | |
aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,pearson,random,8,4,0.5749045753814719,0.13602130778385005 | |
aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,pearson,random,8,0,0.780595487125304,0.022250145374352125 | |
aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,pearson,random,8,1,0.8389921086523722,0.009215256295109017 | |
aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,pearson,random,8,2,0.8803463320171083,0.003907570379771439 | |
aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,pearson,random,8,3,0.7142670311425445,0.04653663665491792 | |
aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,pearson,random,8,4,0.7977979460712193,0.017660348313797546 | |
aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,pearson,random,8,0,0.7240026280446691,0.04228069432019545 | |
aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,pearson,random,8,1,0.8051290094703403,0.01590190576987268 | |
aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,pearson,random,8,2,0.9226246952938778,0.0010919364406592675 | |
aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,pearson,random,8,3,0.770582228125362,0.025231318204288148 | |
aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,pearson,random,8,4,0.5188109005585113,0.18769119165787862 | |
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,pearson,random,8,0,0.9642212364414142,0.00011145218096014672 | |
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,pearson,random,8,1,0.7836454491081474,0.021387948565361206 | |
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,pearson,random,8,2,0.865235745718993,0.005516995432107779 | |
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,pearson,random,8,3,0.819500116935474,0.012783401302719894 | |
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,pearson,random,8,4,0.7432637726714306,0.034578129186903464 | |
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,pearson,random,8,0,0.9522400671025366,0.0002626898916961467 | |
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,pearson,random,8,1,0.9732730607216835,4.677795327851405e-05 | |
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,pearson,random,8,2,0.9110081304703664,0.001646433879397326 | |
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,pearson,random,8,3,0.9433518650586681,0.0004353717167521428 | |
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,pearson,random,8,4,0.880586328075459,0.003884834219553849 | |
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,pearson,random,8,0,0.9603201312455674,0.00015157780411521223 | |
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,pearson,random,8,1,0.9604114108423772,0.00015054459028416203 | |
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,pearson,random,8,2,0.9519258192529104,0.00026784516618954716 | |
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,pearson,random,8,3,0.9352773832366816,0.0006453340323628832 | |
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,pearson,random,8,4,0.9756845534259928,3.5288470321501036e-05 | |
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,pearson,random,8,0,0.6730282904268812,0.06736225845470355 | |
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,pearson,random,8,1,0.9361725603565639,0.0006193510978979659 | |
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,pearson,random,8,2,0.8618105831276622,0.005932414266978994 | |
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,pearson,random,8,3,0.9371490197710903,0.0005918014940797798 | |
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,pearson,random,8,4,0.8046621876144952,0.01601044603512172 | |
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,pearson,random,8,0,0.41770329390345684,0.30313696659492734 | |
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,pearson,random,8,1,0.6529975286213465,0.07915856325659755 | |
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,pearson,random,8,2,0.6950517775314824,0.05566978580633573 | |
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,pearson,random,8,3,0.5130382972054114,0.19351964488420637 | |
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,pearson,random,8,4,0.6825577913683614,0.062140382561143265 | |
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,pearson,random,8,0,0.9059635004669196,0.0019350193188838174 | |
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,pearson,random,8,1,0.8702987510549938,0.00493787146977232 | |
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,pearson,random,8,2,0.8349295032906534,0.009898545248446817 | |
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,pearson,random,8,3,0.8245663895988613,0.011784555837564846 | |
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,pearson,random,8,4,0.9186996315597573,0.0012628532368153516 | |
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,pearson,random,8,0,0.899783088468177,0.002330962388754791 | |
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,pearson,random,8,1,0.8724919719311256,0.004699674798249593 | |
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,pearson,random,8,2,0.9486250828884353,0.00032606741963897914 | |
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,pearson,random,8,3,0.9264530754805538,0.0009405124032405977 | |
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,pearson,random,8,4,0.926933634016331,0.000922537739358256 | |
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,pearson,random,8,0,0.6984411569502376,0.05398723363884652 | |
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,pearson,random,8,1,0.754828418128203,0.03040022622820331 | |
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,pearson,random,8,2,0.5655988276473191,0.14396676855997925 | |
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,pearson,random,8,3,0.9407474980820671,0.000497230334167822 | |
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,pearson,random,8,4,0.770589245932409,0.025229147116181697 | |
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,pearson,random,8,0,0.7775815292717585,0.023123063813025962 | |
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,pearson,random,8,1,0.5611200837416681,0.14787988852194642 | |
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,pearson,random,8,2,0.34646366697352105,0.40049416986179387 | |
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,pearson,random,8,3,0.7868643731535557,0.020500867535993103 | |
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,pearson,random,8,4,0.8114670933196435,0.014473750045325934 | |
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,pearson,random,8,0,0.4013581254554363,0.32436552572418753 | |
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,pearson,random,8,1,0.28341806840646894,0.4963625961904983 | |
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,pearson,random,8,2,0.3139211847524032,0.44892434309679713 | |
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,pearson,random,8,3,0.2606167560977108,0.5330194398770082 | |
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,pearson,random,8,4,0.32260154615753545,0.43577896021471924 | |
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,pearson,random,8,0,0.7827817854375669,0.021629949458519884 | |
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,pearson,random,8,1,0.9421767369217469,0.0004626159242720608 | |
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,pearson,random,8,2,0.5386185630062554,0.16841388744478442 | |
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,pearson,random,8,3,0.7045551126623175,0.05103000019308416 | |
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,pearson,random,8,4,0.8414540075802577,0.00881618884168942 | |
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,pearson,random,8,0,0.8748256107732684,0.0044544778532186755 | |
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,pearson,random,8,1,0.8614522174161048,0.005976999431835443 | |
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,pearson,random,8,2,0.7878166990611953,0.02024289628983945 | |
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,pearson,random,8,3,0.8381151096374623,0.009360136935052572 | |
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,pearson,random,8,4,0.876154278920616,0.0043186280005204514 | |
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,pearson,random,8,0,0.9802952193136,1.884578972104051e-05 | |
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,pearson,random,8,1,0.9841937367574427,9.755845662836177e-06 | |
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,pearson,random,8,2,0.8661864185981796,0.005405102460401999 | |
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,pearson,random,8,3,0.8297856426405835,0.010808669505560614 | |
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,pearson,random,8,4,0.9329487606730291,0.000716243089312378 | |
aggregate,aggregate,MixEval,mixeval_240829.csv,pearson,random,8,0,0.822202489777381,0.01224422861798353 | |
aggregate,aggregate,MixEval,mixeval_240829.csv,pearson,random,8,1,0.6968865871905413,0.05475511707469452 | |
aggregate,aggregate,MixEval,mixeval_240829.csv,pearson,random,8,2,0.9318897100616549,0.0007501099193828288 | |
aggregate,aggregate,MixEval,mixeval_240829.csv,pearson,random,8,3,0.7939152572032528,0.018638835543465734 | |
aggregate,aggregate,MixEval,mixeval_240829.csv,pearson,random,8,4,0.7761614135775217,0.02354161442763604 | |
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,pearson,random,8,0,0.9079242687040253,0.0018192466167481706 | |
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,pearson,random,8,1,0.5935991848770941,0.12081484777974201 | |
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,pearson,random,8,2,0.96841302674998,7.693398893847449e-05 | |
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,pearson,random,8,3,0.9131963004520903,0.001530535130781307 | |
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,pearson,random,8,4,0.7594573765014532,0.02881968270449265 | |
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,pearson,random,8,0,0.6622792441367216,0.07355344210000651 | |
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,pearson,random,8,1,0.5835165093102912,0.1288909419896904 | |
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,pearson,random,8,2,0.7271748558955601,0.04094703171178795 | |
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,pearson,random,8,3,0.7369082697183147,0.0370157216672518 | |
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,pearson,random,8,4,0.7219159720057066,0.04317213020613491 | |
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,pearson,random,8,0,0.8973595810319037,0.002499476856786579 | |
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,pearson,random,8,1,0.6540145328427245,0.07853263145320354 | |
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,pearson,random,8,2,0.9470816844896075,0.0003559262259996983 | |
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,pearson,random,8,3,0.798793471524343,0.017414760604056785 | |
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,pearson,random,8,4,0.766501585020503,0.026513385703318352 | |
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,pearson,random,8,0,0.6776894663079587,0.06477689572321889 | |
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,pearson,random,8,1,0.6576248245381009,0.07633405000799688 | |
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,pearson,random,8,2,0.796342090311639,0.018023378799051942 | |
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,pearson,random,8,3,0.689140856921657,0.058678219175095074 | |
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,pearson,random,8,4,0.6705942614169457,0.06873614015066103 | |
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,pearson,random,8,0,0.6842754194067544,0.0612256583562849 | |
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,pearson,random,8,1,0.7338112096805872,0.03824046140795786 | |
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,pearson,random,8,2,0.8786344078919507,0.0040722405599500165 | |
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,pearson,random,8,3,0.8914863638509409,0.0029400900210167272 | |
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,pearson,random,8,4,0.8522000994286094,0.007203358614415384 | |
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,pearson,random,8,0,0.7479170810940026,0.03285737031031745 | |
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,pearson,random,8,1,0.5899049701184135,0.1237398240474465 | |
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,pearson,random,8,2,0.864013241961245,0.005663050469813282 | |
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,pearson,random,8,3,0.726560560314063,0.04120326937800088 | |
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,pearson,random,8,4,0.7600546147835674,0.02861953111724766 | |
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,pearson,random,8,0,0.8675817638279608,0.00524352512595729 | |
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,pearson,random,8,1,0.4358953069712842,0.280322780055143 | |
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,pearson,random,8,2,0.8724977849323057,0.004699053502733089 | |
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,pearson,random,8,3,0.871502377377448,0.004806214049293794 | |
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,pearson,random,8,4,0.636462032322589,0.08974474991245225 | |
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,pearson,random,8,0,0.7407371067623334,0.035535069908202585 | |
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,pearson,random,8,1,0.13754152986907456,0.7453436298315592 | |
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,pearson,random,8,2,0.8584434869588686,0.006359804257501524 | |
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,pearson,random,8,3,0.9096718109287911,0.0017199423212977748 | |
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,pearson,random,8,4,0.429513562091493,0.2882272134157949 | |
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,pearson,random,8,0,0.7258395762861067,0.04150524782255408 | |
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,pearson,random,8,1,0.4140057077993773,0.3078793667149351 | |
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,pearson,random,8,2,0.8860840192325219,0.003387122941063616 | |
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,pearson,random,8,3,0.8531999374729967,0.007063738601380546 | |
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,pearson,random,8,4,0.570698753672453,0.13958138247636556 | |
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,pearson,random,8,0,0.9462124246513754,0.00037350751375720304 | |
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,pearson,random,8,1,0.820982530302196,0.012485817170678851 | |
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,pearson,random,8,2,0.9284819872198913,0.0008661544234609058 | |
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,pearson,random,8,3,0.9226572389021586,0.0010905865909148318 | |
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,pearson,random,8,4,0.8996834645928126,0.0023377397968761906 | |
aggregate,aggregate,OpenCompass,opencompass_240829.csv,pearson,random,8,0,0.9806889787900566,1.77437080791335e-05 | |
aggregate,aggregate,OpenCompass,opencompass_240829.csv,pearson,random,8,1,0.9467481050448351,0.00036260722071780783 | |
aggregate,aggregate,OpenCompass,opencompass_240829.csv,pearson,random,8,2,0.9051882617143683,0.001982079878231783 | |
aggregate,aggregate,OpenCompass,opencompass_240829.csv,pearson,random,8,3,0.8448816290057799,0.008279149903754354 | |
aggregate,aggregate,OpenCompass,opencompass_240829.csv,pearson,random,8,4,0.9486969514405281,0.0003247187445212263 | |
aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,pearson,random,8,0,0.7138885174194392,0.046707103452906885 | |
aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,pearson,random,8,1,0.40763933138747765,0.3161269846214854 | |
aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,pearson,random,8,2,0.5033557119680766,0.20350786972733814 | |
aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,pearson,random,8,3,0.4943676910774294,0.21301612937354739 | |
aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,pearson,random,8,4,0.3662549994154035,0.3722134961617391 | |
aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,pearson,random,8,0,0.6943274080319848,0.05603338677616118 | |
aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,pearson,random,8,1,0.888202282224346,0.0032069637473251308 | |
aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,pearson,random,8,2,0.862959786938574,0.0057908774192851585 | |
aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,pearson,random,8,3,0.4422315456206938,0.2725814015162671 | |
aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,pearson,random,8,4,0.9314197867245828,0.0007654668867563735 | |
aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,pearson,random,8,0,0.8576726697477571,0.006460333718352682 | |
aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,pearson,random,8,1,0.6477798867796105,0.08241558395766836 | |
aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,pearson,random,8,2,0.7105249096891054,0.04823848031855015 | |
aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,pearson,random,8,3,0.7433756448219943,0.034536127920169364 | |
aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,pearson,random,8,4,0.465629371128827,0.24492880327618063 | |
aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,pearson,random,8,0,0.9815968610969954,1.5367458655827867e-05 | |
aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,pearson,random,8,1,0.9215279351913577,0.0011380681078154023 | |
aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,pearson,random,8,2,0.9028698976709195,0.0021272329705264844 | |
aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,pearson,random,8,3,0.8115257987039834,0.014460915122317916 | |
aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,pearson,random,8,4,0.8840656907304268,0.003564741739845647 | |
aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,pearson,random,8,0,0.9288767434076772,0.0008521494712455959 | |
aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,pearson,random,8,1,0.8762491857760322,0.004309027650395265 | |
aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,pearson,random,8,2,0.822174167720692,0.012249803466994006 | |
aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,pearson,random,8,3,0.8388480886223416,0.009238949980481774 | |
aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,pearson,random,8,4,0.9318866818637482,0.0007502082286076188 | |
aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,pearson,random,8,0,0.6752208316271633,0.06613869004956173 | |
aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,pearson,random,8,1,0.7677373687773497,0.026120973578910495 | |
aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,pearson,random,8,2,0.7919204265038193,0.01915443839404165 | |
aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,pearson,random,8,3,0.8238198607264919,0.01192852239680578 | |
aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,pearson,random,8,4,0.8788769140000767,0.0040486473187813605 | |
aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,pearson,random,8,0,0.5937971020205063,0.1206592532108973 | |
aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,pearson,random,8,1,0.6743688104667733,0.0666125934693148 | |
aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,pearson,random,8,2,0.6092910701405022,0.10882867605607495 | |
aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,pearson,random,8,3,0.02436876480189197,0.954326651607438 | |
aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,pearson,random,8,4,0.7114255278499215,0.04782552820112736 | |
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,pearson,random,8,0,0.5887872724291499,0.12463254240428198 | |
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,pearson,random,8,1,0.4029552549015283,0.32226121873409685 | |
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,pearson,random,8,2,0.19589220319331574,0.6419903458052949 | |
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,pearson,random,8,3,0.5147894627560958,0.1917415408232741 | |
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,pearson,random,8,4,0.43696792691727815,0.2790047957490856 | |
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,pearson,random,8,0,0.9683600812057522,7.731839943750683e-05 | |
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,pearson,random,8,1,0.9490060035318915,0.00031896092810029624 | |
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,pearson,random,8,2,0.9033732116949054,0.0020951534061901173 | |
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,pearson,random,8,3,0.9728319200142996,4.911626350007423e-05 | |
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,pearson,random,8,4,0.9174158952141087,0.0013223130420052574 | |
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,pearson,random,8,0,0.8698029729880158,0.00499276771087744 | |
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,pearson,random,8,1,0.9736499620869766,4.483954353741208e-05 | |
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,pearson,random,8,2,0.8938963574061565,0.002753683842916408 | |
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,pearson,random,8,3,0.9427230009399408,0.00044981624708065733 | |
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,pearson,random,8,4,0.9288091831587435,0.0008545357544848401 | |
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,pearson,random,8,0,0.9876650170257133,4.648675321533348e-06 | |
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,pearson,random,8,1,0.9200698352872445,0.0012013420941124318 | |
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,pearson,random,8,2,0.8197843971795349,0.012725991028944833 | |
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,pearson,random,8,3,0.9667731014329254,8.943826166773405e-05 | |
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,pearson,random,8,4,0.9135236868955329,0.0015136659995374103 | |
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,pearson,random,8,0,0.983826044072315,1.0449743172360012e-05 | |
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,pearson,random,8,1,0.9482689395026054,0.000332805134027447 | |
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,pearson,random,8,2,0.9334433471484072,0.0007007762613840839 | |
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,pearson,random,8,3,0.8998371432675459,0.0023272903802322954 | |
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,pearson,random,8,4,0.9131450099069247,0.0015331889972515346 | |
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,pearson,random,8,0,0.9445409047411082,0.00040889964932544416 | |
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,pearson,random,8,1,0.8996453255999854,0.00234033776853281 | |
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,pearson,random,8,2,0.8662449830102448,0.005398257529969565 | |
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,pearson,random,8,3,0.9506955154682739,0.00028866872380162265 | |
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,pearson,random,8,4,0.9121357775980045,0.0015860194531010332 | |
aggregate,aggregate,LiveBench Language,livebench_240829.csv,pearson,random,8,0,0.9469225816315634,0.000359102582060145 | |
aggregate,aggregate,LiveBench Language,livebench_240829.csv,pearson,random,8,1,0.790872393374341,0.019428850798750914 | |
aggregate,aggregate,LiveBench Language,livebench_240829.csv,pearson,random,8,2,0.7384692720332464,0.03640761031575469 | |
aggregate,aggregate,LiveBench Language,livebench_240829.csv,pearson,random,8,3,0.9396936265489109,0.0005238133760109684 | |
aggregate,aggregate,LiveBench Language,livebench_240829.csv,pearson,random,8,4,0.7853349194194776,0.020919442242219075 | |
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,pearson,random,8,0,0.8636070293544758,0.005712124057773506 | |
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,pearson,random,8,1,0.837126038633602,0.009525258316342535 | |
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,pearson,random,8,2,0.7663953319208139,0.026547294337781743 | |
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,pearson,random,8,3,0.8834569465544357,0.00361946726545403 | |
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,pearson,random,8,4,0.8480938359553485,0.00779520658099071 | |
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,pearson,random,8,0,0.9882164477730901,4.05436289119973e-06 | |
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,pearson,random,8,1,0.9749878899040407,3.838912250625781e-05 | |
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,pearson,random,8,2,0.9189017807616305,0.0012536521795481071 | |
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,pearson,random,8,3,0.976785228034165,3.073554131266073e-05 | |
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,pearson,random,8,4,0.9683736529744773,7.721974100004276e-05 | |
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,pearson,random,8,0,0.8793267175321069,0.004005119722136405 | |
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,pearson,random,8,1,0.8760721346635911,0.004326948446281908 | |
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,pearson,random,8,2,0.9315137258308156,0.0007623806815109492 | |
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,pearson,random,8,3,0.9671655908223616,8.633181797191984e-05 | |
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,pearson,random,8,4,0.7675767218262903,0.026171781192995118 | |
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,pearson,random,8,0,0.8483878251754778,0.007751839541749867 | |
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,pearson,random,8,1,0.9222607240796445,0.0011071076795417618 | |
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,pearson,random,8,2,0.9440994017259922,0.00041860181264251746 | |
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,pearson,random,8,3,0.9640433681068886,0.00011310737614553013 | |
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,pearson,random,8,4,0.692434840005101,0.056990052908859494 | |
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,pearson,random,8,0,0.9942767822652612,4.6665376445687894e-07 | |
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,pearson,random,8,1,0.9682235346488557,7.831565067564543e-05 | |
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,pearson,random,8,2,0.9370054660599566,0.0005958002530390111 | |
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,pearson,random,8,3,0.969420946106877,6.985512173523951e-05 | |
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,pearson,random,8,4,0.9834828472581691,1.1126279772397877e-05 | |
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,pearson,random,8,0,0.9928216304628095,9.197638948465057e-07 | |
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,pearson,random,8,1,0.9611899818187688,0.00014192004448559492 | |
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,pearson,random,8,2,0.9411758308443503,0.0004866843681750784 | |
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,pearson,random,8,3,0.9688368521395198,7.390226580769654e-05 | |
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,pearson,random,8,4,0.9723616916410369,5.16925798887181e-05 | |
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,pearson,random,8,0,0.9766036636486001,3.14580315476573e-05 | |
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,pearson,random,8,1,0.9597878054141521,0.00015769662952759886 | |
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,pearson,random,8,2,0.9404428288332258,0.0005048221249291256 | |
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,pearson,random,8,3,0.9831715348590928,1.176456701375346e-05 | |
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,pearson,random,8,4,0.97187689823272,5.4440740892278444e-05 | |
aggregate,aggregate,WildBench Score,wildbench_240829.csv,pearson,random,8,0,0.9852421877364517,7.946695487913594e-06 | |
aggregate,aggregate,WildBench Score,wildbench_240829.csv,pearson,random,8,1,0.9692179758222269,7.124441373542135e-05 | |
aggregate,aggregate,WildBench Score,wildbench_240829.csv,pearson,random,8,2,0.9412248237761267,0.000485487558057933 | |
aggregate,aggregate,WildBench Score,wildbench_240829.csv,pearson,random,8,3,0.98025276424875,1.8967257174977277e-05 | |
aggregate,aggregate,WildBench Score,wildbench_240829.csv,pearson,random,8,4,0.9846373995357367,8.960181355366343e-06 | |
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,pearson,random,8,0,0.9522400671025366,0.0002626898916961467 | |
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,pearson,random,8,1,0.9732730607216835,4.677795327851405e-05 | |
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,pearson,random,8,2,0.9110081304703664,0.001646433879397326 | |
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,pearson,random,8,3,0.9433518650586681,0.0004353717167521428 | |
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,pearson,random,8,4,0.880586328075459,0.003884834219553849 | |
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.9355663499255871,0.0006368701046576545 | |
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.9499604642147754,0.0003016036750416735 | |
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.7164442699126142,0.04556339297891151 | |
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.5643812833359342,0.14502482192576685 | |
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.4448334653124403,0.269433453257965 | |
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.9020957808919513,0.002177191904645508 | |
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.9140262325400854,0.0014880077902407654 | |
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.6613543728531551,0.07410115498793113 | |
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.4797794956768499,0.2289297958345603 | |
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.49503702005526434,0.21230024172428238 | |
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.8658004484348707,0.005450353400185282 | |
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.9239450258900821,0.0010380421984977164 | |
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.6878185417270377,0.05936418242167244 | |
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.6427492187377651,0.08562857067256696 | |
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.19987101474191585,0.6351028985023905 | |
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.7695981699173929,0.025536900476404875 | |
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.851160886507116,0.00735033097799936 | |
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.7535063061583401,0.030861215825263487 | |
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.26946310602236634,0.5186811891252074 | |
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.5071239778851739,0.19958915881626008 | |
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.845558834843199,0.00817557674320208 | |
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.8223598748455347,0.01221327849153134 | |
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.7520379034546343,0.03137821860478068 | |
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.5986152394502113,0.1169062576526029 | |
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.766509325140422,0.026510916638992615 | |
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.6388656044215879,0.08815791552969902 | |
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.8220592376168137,0.012272442496278822 | |
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.1610992186087647,0.7031245257171708 | |
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.22938177579714764,0.584757473087143 | |
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.16217150942988084,0.7012176634258844 | |
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,0,0.8536693780854105,0.0069987855857581984 | |
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,1,0.9079591032101378,0.0018172316533511903 | |
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,2,0.7448797028215589,0.033974472983626124 | |
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,3,0.436470242791583,0.2796159471960331 | |
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,pearson,random,8,4,0.5113717481429286,0.195219904727713 | |
aggregate,aggregate,BFCL,bfcl_240906.csv,pearson,random,8,0,0.8848684214582546,0.0034933971141531536 | |
aggregate,aggregate,BFCL,bfcl_240906.csv,pearson,random,8,1,0.9247518427204778,0.0010059807632682822 | |
aggregate,aggregate,BFCL,bfcl_240906.csv,pearson,random,8,2,0.7024798803756629,0.05202256738347333 | |
aggregate,aggregate,BFCL,bfcl_240906.csv,pearson,random,8,3,0.6111548412929141,0.10745210550108082 | |
aggregate,aggregate,BFCL,bfcl_240906.csv,pearson,random,8,4,0.8864983521119945,0.0033513827582610342 | |
aggregate,aggregate,BIGGEN,biggen_240829.csv,pearson,random,8,0,0.8443252756395498,0.008364861793357709 | |
aggregate,aggregate,BIGGEN,biggen_240829.csv,pearson,random,8,1,0.8199557285303699,0.012691469447090417 | |
aggregate,aggregate,BIGGEN,biggen_240829.csv,pearson,random,8,2,0.6898121736766818,0.05833178396126367 | |
aggregate,aggregate,BIGGEN,biggen_240829.csv,pearson,random,8,3,0.1445400076243653,0.732738456710739 | |
aggregate,aggregate,BIGGEN,biggen_240829.csv,pearson,random,8,4,-0.13444519427677581,0.7509364951619687 | |
aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,pearson,random,8,0,0.9461712339012929,0.00037435448514068834 | |
aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,pearson,random,8,1,0.8543556725359636,0.006904516600543572 | |
aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,pearson,random,8,2,0.7671160990392422,0.026317800283773948 | |
aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,pearson,random,8,3,0.4230508906614041,0.29634091151848907 | |
aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,pearson,random,8,4,0.29492042180464345,0.478252042515081 | |
aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,pearson,random,8,0,0.8192056092552416,0.01284304904344425 | |
aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,pearson,random,8,1,0.8053230426409881,0.015856927546595193 | |
aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,pearson,random,8,2,0.6785867773117831,0.06428605698561919 | |
aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,pearson,random,8,3,0.021028776761034942,0.960582665935811 | |
aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,pearson,random,8,4,-0.25337930013147175,0.5448562000018814 | |
aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,pearson,random,8,0,0.8101772449555595,0.014757563523095152 | |
aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,pearson,random,8,1,0.7844308170919763,0.021169355122089707 | |
aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,pearson,random,8,2,0.6407686957715764,0.08691312009391092 | |
aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,pearson,random,8,3,0.042093006210129874,0.9211687904012325 | |
aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,pearson,random,8,4,-0.2813292229519864,0.4996795026573654 | |
aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,pearson,random,8,0,0.8350456630970934,0.00987857623206292 | |
aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,pearson,random,8,1,0.879311548672376,0.004006582681021272 | |
aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,pearson,random,8,2,0.6951300585252861,0.0556305769370549 | |
aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,pearson,random,8,3,0.30955291195703166,0.4556002793087552 | |
aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,pearson,random,8,4,0.09897629382276267,0.8156278898050575 | |
aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,pearson,random,8,0,0.8313126956210078,0.010533178480029779 | |
aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,pearson,random,8,1,0.8169388413464165,0.01330802664448977 | |
aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,pearson,random,8,2,0.8065284450649773,0.015579295379409611 | |
aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,pearson,random,8,3,0.23722382427262312,0.5716108619128892 | |
aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,pearson,random,8,4,0.026088426326565897,0.9511063910298649 | |
aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,pearson,random,8,0,0.5558829816104426,0.15252894598370506 | |
aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,pearson,random,8,1,0.6390946692796851,0.08800754271923365 | |
aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,pearson,random,8,2,0.24121345447897227,0.5649619826999719 | |
aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,pearson,random,8,3,-0.13262144042688304,0.7542351704927408 | |
aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,pearson,random,8,4,-0.46784288126219703,0.24238975539995447 | |
aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,pearson,random,8,0,0.7467577882406231,0.03328104267130768 | |
aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,pearson,random,8,1,0.7611545287510072,0.028253164658278467 | |
aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,pearson,random,8,2,0.6541774611460981,0.07843262445172178 | |
aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,pearson,random,8,3,0.0830822493170678,0.8449361587214159 | |
aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,pearson,random,8,4,-0.1985934514676979,0.6373119372341151 | |
aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,pearson,random,8,0,0.9103256104990007,0.001683717098370581 | |
aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,pearson,random,8,1,0.8079204807250888,0.015262498588799642 | |
aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,pearson,random,8,2,0.7253154362419392,0.0417256201301186 | |
aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,pearson,random,8,3,0.2776474358858506,0.5055464711128136 | |
aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,pearson,random,8,4,-0.04029159995291984,0.9245349726533298 | |
aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,pearson,random,8,0,0.919432996814919,0.0012296819224052442 | |
aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,pearson,random,8,1,0.87005129824662,0.004965222567299112 | |
aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,pearson,random,8,2,0.9073703100625691,0.001851485138509531 | |
aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,pearson,random,8,3,0.8673887162219034,0.005265692212272121 | |
aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,pearson,random,8,4,0.8916723527123611,0.0029254223429427636 | |
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,pearson,random,8,0,0.9804801069360884,1.832282630082123e-05 | |
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,pearson,random,8,1,0.980051779203359,1.9549343460335766e-05 | |
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,pearson,random,8,2,0.9720131442366731,5.3658869462094946e-05 | |
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,pearson,random,8,3,0.9165887813382055,0.001361572704071016 | |
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,pearson,random,8,4,0.9225103255266087,0.0010966889416837342 | |
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,pearson,random,8,0,0.9292369266176062,0.000839501038985727 | |
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,pearson,random,8,1,0.9505492134066896,0.00029121355501060477 | |
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,pearson,random,8,2,0.9415690777822339,0.00047713248045663163 | |
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,pearson,random,8,3,0.9576750897378552,0.00018358576102437457 | |
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,pearson,random,8,4,0.8850761460392197,0.0034750864462593195 | |
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,pearson,random,8,0,0.9798647388383346,2.0101576768271062e-05 | |
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,pearson,random,8,1,0.9781250835045174,2.5741076148769547e-05 | |
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,pearson,random,8,2,0.9598475365356987,0.00015700207944980397 | |
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,pearson,random,8,3,0.9317002702003969,0.000756276259880365 | |
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,pearson,random,8,4,0.8240635545541923,0.011881405061211926 | |
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,pearson,random,8,0,0.9849433970479835,8.437305784682183e-06 | |
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,pearson,random,8,1,0.9899107226768695,2.548168158279175e-06 | |
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,pearson,random,8,2,0.9645217100316719,0.00010869253777108847 | |
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,pearson,random,8,3,0.9447465624679983,0.00040443116308794275 | |
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,pearson,random,8,4,0.8760879368136391,0.0043253470355424355 | |
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,pearson,random,8,0,0.9469408250476264,0.0003587374254477132 | |
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,pearson,random,8,1,0.9498225876442147,0.000304071618749767 | |
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,pearson,random,8,2,0.9413785598975157,0.0004817446027243596 | |
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,pearson,random,8,3,0.8197292667265523,0.012737111858293043 | |
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,pearson,random,8,4,0.9057861973602506,0.0019457176947306907 | |
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,pearson,random,8,0,0.9413025091864188,0.000483593804288479 | |
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,pearson,random,8,1,0.9083254977326705,0.001796125778484392 | |
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,pearson,random,8,2,0.8626635526406192,0.005827152548807454 | |
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,pearson,random,8,3,0.8043418970652331,0.016085184583393794 | |
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,pearson,random,8,4,0.8946872852632068,0.0026942203148939193 | |
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,pearson,random,8,0,0.9025950086780581,0.002144887259438991 | |
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,pearson,random,8,1,0.7564264003460613,0.02984872863501939 | |
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,pearson,random,8,2,0.9033527343998258,0.002096452391428316 | |
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,pearson,random,8,3,0.8494277893147777,0.0075996673267298715 | |
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,pearson,random,8,4,0.8534145445088147,0.007033997470343221 | |