safetybat / cache_old /agreements_cache_1b58bbc4e0d124b0a524da1001369741.csv
jbnayahu's picture
clean cache
131e411
scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value
Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,8,0,-0.14285714285714285,0.7195436507936508
Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,8,1,-0.14285714285714285,0.7195436507936508
Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,8,2,-0.2857142857142857,0.39875992063492066
Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,8,3,-0.21428571428571427,0.5484126984126985
Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,8,4,-0.07142857142857142,0.9048611111111111
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6182840223353117,0.0340492747686748
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9285714285714285,0.0003968253968253968
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,-0.07142857142857142,0.9048611111111111
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,-0.3571428571428571,0.27509920634920637
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,-0.07142857142857142,0.9048611111111111
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,0.21428571428571427,0.5484126984126985
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,-0.3571428571428571,0.27509920634920637
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.2857142857142857,0.39875992063492066
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.0,1.0
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,0.14285714285714285,0.7195436507936508
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,0.2857142857142857,0.39875992063492066
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,0.42857142857142855,0.17886904761904762
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,0.5714285714285714,0.06101190476190476
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9285714285714285,0.0003968253968253968
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9285714285714285,0.0003968253968253968
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7142857142857142,0.014136904761904762
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,0.5714285714285714,0.06101190476190476
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,0.42857142857142855,0.17886904761904762
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,0.5714285714285714,0.06101190476190476
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5455447255899809,0.0614649096074132
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7637626158259734,0.008839740160738534
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7142857142857142,0.014136904761904762
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5455447255899809,0.0614649096074132
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7142857142857142,0.014136904761904762
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9285714285714285,0.0003968253968253968
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9092412093166348,0.0018276750354536814
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,4,0.6428571428571428,0.03115079365079365
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9285714285714285,0.0003968253968253968
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7637626158259734,0.008839740160738534
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3401680257083045,0.25175949861106117
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.0,1.0
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.14285714285714285,0.7195436507936508
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,3,0.42857142857142855,0.17886904761904762
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,4,0.4999999999999999,0.10868055555555556
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.21428571428571427,0.5484126984126985
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5455447255899809,0.0614649096074132
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,3,0.18184824186332696,0.5330356744917513
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,4,0.2545875386086578,0.38281014365989596
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.10910894511799618,0.7083840532183997
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.21428571428571427,0.5484126984126985
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,3,0.21428571428571427,0.5484126984126985
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,4,0.5714285714285714,0.06101190476190476
OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111
OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381
OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111
OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9285714285714285,0.0003968253968253968
OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111
OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968
OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111
OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381
OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762
OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968
OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111
OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7142857142857142,0.014136904761904762
OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381
OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381
OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7142857142857142,0.014136904761904762
OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111
OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111
OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111
OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381
OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6910233190806425,0.017844011512848347
OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9999999999999998,4.96031746031746e-05
OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.42857142857142855,0.17886904761904762
OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.40006613209931935,0.17023995462900499
OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5929994533288809,0.04437842734548688
OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6910233190806425,0.017844011512848347
OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.5455447255899809,0.0614649096074132
OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.6910233190806425,0.017844011512848347
OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.5714285714285714,0.06101190476190476
OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.5714285714285714,0.06101190476190476
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,8,3,0.5714285714285714,0.06101190476190476
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,8,4,0.42857142857142855,0.17886904761904762
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9285714285714285,0.0003968253968253968
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9285714285714285,0.0003968253968253968
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9285714285714285,0.0003968253968253968
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9285714285714285,0.0003968253968253968
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.4999999999999999,0.10868055555555556
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.5714285714285714,0.06101190476190476
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.4999999999999999,0.10868055555555556
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7142857142857142,0.014136904761904762
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7142857142857142,0.014136904761904762
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6910233190806425,0.017844011512848347
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7637626158259734,0.008839740160738534
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.21428571428571427,0.5484126984126985
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9285714285714285,0.0003968253968253968
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.5714285714285714,0.06101190476190476
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.07142857142857142,0.9048611111111111
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7142857142857142,0.014136904761904762
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.6428571428571428,0.03115079365079365
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.21428571428571427,0.5484126984126985
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3706246583305506,0.20891238174069848
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.3571428571428571,0.27509920634920637
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.40006613209931935,0.17023995462900499
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.6428571428571428,0.03115079365079365
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5455447255899809,0.0614649096074132
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.6428571428571428,0.03115079365079365
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.5714285714285714,0.06101190476190476
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,8,3,0.5714285714285714,0.06101190476190476
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,8,4,0.5714285714285714,0.06101190476190476
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.6428571428571428,0.03115079365079365
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.6428571428571428,0.03115079365079365
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.4999999999999999,0.10868055555555556
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.4999999999999999,0.10868055555555556
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9999999999999998,4.96031746031746e-05
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.6428571428571428,0.03115079365079365
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.4999999999999999,0.10868055555555556
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.42857142857142855,0.17886904761904762
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6910233190806425,0.017844011512848347
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.5714285714285714,0.06101190476190476
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.6182840223353117,0.0340492747686748
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9285714285714285,0.0003968253968253968
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7142857142857142,0.014136904761904762
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.3571428571428571,0.27509920634920637
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.22237479499833035,0.45088703102517036
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.2545875386086578,0.38281014365989596
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5455447255899809,0.0614649096074132
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7637626158259734,0.008839740160738534
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.5714285714285714,0.06101190476190476
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.14285714285714285,0.7195436507936508
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9092412093166348,0.0018276750354536814
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7637626158259734,0.008839740160738534
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,3,0.5714285714285714,0.06101190476190476
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9285714285714285,0.0003968253968253968
OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968
OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381
OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762
OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381
OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111
OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.0,1.0
OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637
OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,3,0.42857142857142855,0.17886904761904762
OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,4,0.21428571428571427,0.5484126984126985
OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381
OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762
OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,3,0.2857142857142857,0.39875992063492066
OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381
OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556
OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7142857142857142,0.014136904761904762
OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,4,0.42857142857142855,0.17886904761904762
OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968
OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762
OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7142857142857142,0.014136904761904762
OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381
OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111
OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.47280542884465016,0.10506382347888965
OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7142857142857142,0.014136904761904762
OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381
OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762
OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7142857142857142,0.014136904761904762
OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111
OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637
OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066
OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,3,0.21428571428571427,0.5484126984126985
OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7142857142857142,0.014136904761904762
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,8,1,0.14285714285714285,0.7195436507936508
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,8,3,0.4999999999999999,0.10868055555555556
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,8,4,0.2857142857142857,0.39875992063492066
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9285714285714285,0.0003968253968253968
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6910233190806425,0.017844011512848347
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.836501912571304,0.004136737098676645
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5455447255899809,0.0614649096074132
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9285714285714285,0.0003968253968253968
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.5714285714285714,0.06101190476190476
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9819805060619657,0.0007619896395304237
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7142857142857142,0.014136904761904762
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7142857142857142,0.014136904761904762
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.6428571428571428,0.03115079365079365
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.6428571428571428,0.03115079365079365
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7142857142857142,0.014136904761904762
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9285714285714285,0.0003968253968253968
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9285714285714285,0.0003968253968253968
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9285714285714285,0.0003968253968253968
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.6428571428571428,0.03115079365079365
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9285714285714285,0.0003968253968253968
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.4999999999999999,0.10868055555555556
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9999999999999998,4.96031746031746e-05
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9999999999999998,4.96031746031746e-05
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9285714285714285,0.0003968253968253968
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9999999999999998,4.96031746031746e-05
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,3,0.3571428571428571,0.27509920634920637
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,4,0.42857142857142855,0.17886904761904762
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.5455447255899809,0.0614649096074132
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,3,0.2857142857142857,0.39875992063492066
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,4,0.4999999999999999,0.10868055555555556
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,3,0.42857142857142855,0.17886904761904762
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,4,0.21428571428571427,0.5484126984126985
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,3,0.14285714285714285,0.7195436507936508
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,4,0.4999999999999999,0.10868055555555556
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,3,0.21428571428571427,0.5484126984126985
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,4,0.5714285714285714,0.06101190476190476
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,3,-0.07142857142857142,0.9048611111111111
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,4,0.21428571428571427,0.5484126984126985
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,3,0.14285714285714285,0.7195436507936508
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,4,0.47280542884465016,0.10506382347888965
BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762
BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,8,3,0.2857142857142857,0.39875992063492066
BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,8,4,0.4999999999999999,0.10868055555555556
BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,8,3,0.21428571428571427,0.5484126984126985
BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,8,4,-0.3571428571428571,0.27509920634920637
BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968
BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381
BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.47280542884465016,0.10506382347888965
BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,8,3,0.3571428571428571,0.27509920634920637
BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,8,4,0.18184824186332696,0.5330356744917513
BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,8,3,0.10910894511799618,0.7083840532183997
BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,8,4,-0.40006613209931935,0.17023995462900499
BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.47280542884465016,0.10506382347888965
BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6182840223353117,0.0340492747686748
BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5455447255899809,0.0614649096074132
BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,3,0.21428571428571427,0.5484126984126985
BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,4,-0.42857142857142855,0.17886904761904762
BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476
BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381
BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,3,0.2857142857142857,0.39875992063492066
BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,4,0.0,1.0
BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476
BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,8,3,0.21428571428571427,0.5484126984126985
BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,8,4,-0.07142857142857142,0.9048611111111111
BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.2857142857142857,0.39875992063492066
BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762
BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.14285714285714285,0.7195436507936508
BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,8,3,-0.14285714285714285,0.7195436507936508
BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,8,4,-0.42857142857142855,0.17886904761904762
BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762
BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762
BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,8,3,0.21428571428571427,0.5484126984126985
BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,8,4,-0.2545875386086578,0.38281014365989596
BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365
BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365
BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,8,3,0.18184824186332696,0.5330356744917513
BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,8,4,-0.21428571428571427,0.5484126984126985
BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556
BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476
BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7637626158259734,0.008839740160738534
BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,8,3,0.6910233190806425,0.017844011512848347
BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,8,4,0.6428571428571428,0.03115079365079365
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.6910233190806425,0.017844011512848347
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,3,0.836501912571304,0.004136737098676645
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,4,0.9285714285714285,0.0003968253968253968
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,3,0.7637626158259734,0.008839740160738534
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,4,0.7142857142857142,0.014136904761904762
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,3,0.6182840223353117,0.0340492747686748
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,3,0.6428571428571428,0.03115079365079365
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,3,0.6428571428571428,0.03115079365079365
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,4,0.7142857142857142,0.014136904761904762
aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,8,0,-0.14285714285714285,0.7195436507936508
aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,8,1,-0.14285714285714285,0.7195436507936508
aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,8,2,-0.2857142857142857,0.39875992063492066
aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,8,3,-0.21428571428571427,0.5484126984126985
aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,8,4,-0.07142857142857142,0.9048611111111111
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,8,1,0.6182840223353117,0.0340492747686748
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,8,4,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,8,0,-0.07142857142857142,0.9048611111111111
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,8,1,-0.3571428571428571,0.27509920634920637
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,8,2,-0.07142857142857142,0.9048611111111111
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,8,3,0.21428571428571427,0.5484126984126985
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,8,4,-0.3571428571428571,0.27509920634920637
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,8,0,0.2857142857142857,0.39875992063492066
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,8,1,0.0,1.0
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,8,3,0.14285714285714285,0.7195436507936508
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,8,4,0.2857142857142857,0.39875992063492066
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,8,3,0.42857142857142855,0.17886904761904762
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,8,4,0.5714285714285714,0.06101190476190476
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,8,1,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,8,3,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,8,3,0.7142857142857142,0.014136904761904762
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,8,3,0.5714285714285714,0.06101190476190476
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,8,3,0.42857142857142855,0.17886904761904762
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,8,4,0.5714285714285714,0.06101190476190476
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,8,2,0.5455447255899809,0.0614649096074132
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,8,3,0.7637626158259734,0.008839740160738534
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,8,4,0.7142857142857142,0.014136904761904762
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,8,1,0.5455447255899809,0.0614649096074132
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,8,3,0.7142857142857142,0.014136904761904762
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,8,3,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,8,4,0.9092412093166348,0.0018276750354536814
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,4,0.6428571428571428,0.03115079365079365
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,3,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,4,0.7637626158259734,0.008839740160738534
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.3401680257083045,0.25175949861106117
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.0,1.0
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.14285714285714285,0.7195436507936508
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,3,0.42857142857142855,0.17886904761904762
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,4,0.4999999999999999,0.10868055555555556
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.21428571428571427,0.5484126984126985
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.5455447255899809,0.0614649096074132
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,3,0.18184824186332696,0.5330356744917513
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,4,0.2545875386086578,0.38281014365989596
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.10910894511799618,0.7083840532183997
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.21428571428571427,0.5484126984126985
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,3,0.21428571428571427,0.5484126984126985
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,4,0.5714285714285714,0.06101190476190476
aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111
aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381
aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111
aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,8,3,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111
aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111
aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381
aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762
aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111
aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,8,4,0.7142857142857142,0.014136904761904762
aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381
aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381
aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,8,4,0.7142857142857142,0.014136904761904762
aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111
aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111
aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111
aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381
aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,8,1,0.6910233190806425,0.017844011512848347
aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,8,2,0.9999999999999998,4.96031746031746e-05
aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,8,3,0.42857142857142855,0.17886904761904762
aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,8,4,0.40006613209931935,0.17023995462900499
aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,8,1,0.5929994533288809,0.04437842734548688
aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,8,2,0.6910233190806425,0.017844011512848347
aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,8,3,0.5455447255899809,0.0614649096074132
aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,8,4,0.6910233190806425,0.017844011512848347
aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,8,3,0.5714285714285714,0.06101190476190476
aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,8,4,0.5714285714285714,0.06101190476190476
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,8,3,0.5714285714285714,0.06101190476190476
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,8,4,0.42857142857142855,0.17886904761904762
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,8,1,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,8,4,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,8,1,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,8,3,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,8,4,0.4999999999999999,0.10868055555555556
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,8,3,0.5714285714285714,0.06101190476190476
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,8,4,0.4999999999999999,0.10868055555555556
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,8,3,0.7142857142857142,0.014136904761904762
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,8,4,0.7142857142857142,0.014136904761904762
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,8,0,0.6910233190806425,0.017844011512848347
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,8,4,0.7637626158259734,0.008839740160738534
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,8,0,0.21428571428571427,0.5484126984126985
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,8,3,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,8,4,0.5714285714285714,0.06101190476190476
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,8,2,0.07142857142857142,0.9048611111111111
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,8,3,0.7142857142857142,0.014136904761904762
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,8,4,0.6428571428571428,0.03115079365079365
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,8,0,0.21428571428571427,0.5484126984126985
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,8,2,0.3706246583305506,0.20891238174069848
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,8,3,0.3571428571428571,0.27509920634920637
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,8,4,0.40006613209931935,0.17023995462900499
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,8,3,0.6428571428571428,0.03115079365079365
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,8,1,0.5455447255899809,0.0614649096074132
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,8,3,0.6428571428571428,0.03115079365079365
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,8,4,0.5714285714285714,0.06101190476190476
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,8,3,0.5714285714285714,0.06101190476190476
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,8,4,0.5714285714285714,0.06101190476190476
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,3,0.6428571428571428,0.03115079365079365
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,3,0.6428571428571428,0.03115079365079365
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,4,0.4999999999999999,0.10868055555555556
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,3,0.4999999999999999,0.10868055555555556
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,2,0.9999999999999998,4.96031746031746e-05
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,3,0.6428571428571428,0.03115079365079365
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,3,0.4999999999999999,0.10868055555555556
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,4,0.42857142857142855,0.17886904761904762
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,2,0.6910233190806425,0.017844011512848347
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,4,0.5714285714285714,0.06101190476190476
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,3,0.6182840223353117,0.0340492747686748
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,4,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,3,0.7142857142857142,0.014136904761904762
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,4,0.3571428571428571,0.27509920634920637
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,1,0.22237479499833035,0.45088703102517036
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,4,0.2545875386086578,0.38281014365989596
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,0,0.5455447255899809,0.0614649096074132
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,2,0.7637626158259734,0.008839740160738534
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,3,0.5714285714285714,0.06101190476190476
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,4,0.14285714285714285,0.7195436507936508
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,0,0.9092412093166348,0.0018276750354536814
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,1,0.7637626158259734,0.008839740160738534
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,3,0.5714285714285714,0.06101190476190476
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,4,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381
aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762
aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381
aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111
aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,8,1,0.0,1.0
aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637
aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,8,3,0.42857142857142855,0.17886904761904762
aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,8,4,0.21428571428571427,0.5484126984126985
aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381
aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762
aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,8,3,0.2857142857142857,0.39875992063492066
aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381
aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556
aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,8,3,0.7142857142857142,0.014136904761904762
aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,8,4,0.42857142857142855,0.17886904761904762
aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762
aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,8,3,0.7142857142857142,0.014136904761904762
aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381
aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111
aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,8,2,0.47280542884465016,0.10506382347888965
aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,8,3,0.7142857142857142,0.014136904761904762
aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381
aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762
aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,8,3,0.7142857142857142,0.014136904761904762
aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111
aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637
aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066
aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,8,3,0.21428571428571427,0.5484126984126985
aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,8,4,0.7142857142857142,0.014136904761904762
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,8,1,0.14285714285714285,0.7195436507936508
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,8,3,0.4999999999999999,0.10868055555555556
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,8,4,0.2857142857142857,0.39875992063492066
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,4,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,0,0.6910233190806425,0.017844011512848347
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,1,0.836501912571304,0.004136737098676645
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,2,0.5455447255899809,0.0614649096074132
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,4,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,3,0.5714285714285714,0.06101190476190476
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,0,0.9819805060619657,0.0007619896395304237
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,3,0.7142857142857142,0.014136904761904762
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,4,0.7142857142857142,0.014136904761904762
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,4,0.6428571428571428,0.03115079365079365
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,3,0.6428571428571428,0.03115079365079365
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,4,0.7142857142857142,0.014136904761904762
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,3,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,4,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,3,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,4,0.6428571428571428,0.03115079365079365
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,3,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,4,0.4999999999999999,0.10868055555555556
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,3,0.9999999999999998,4.96031746031746e-05
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,3,0.9999999999999998,4.96031746031746e-05
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,4,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,3,0.9999999999999998,4.96031746031746e-05
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,3,0.3571428571428571,0.27509920634920637
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,4,0.42857142857142855,0.17886904761904762
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.5455447255899809,0.0614649096074132
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,3,0.2857142857142857,0.39875992063492066
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,4,0.4999999999999999,0.10868055555555556
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,3,0.42857142857142855,0.17886904761904762
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,4,0.21428571428571427,0.5484126984126985
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,3,0.14285714285714285,0.7195436507936508
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,4,0.4999999999999999,0.10868055555555556
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,3,0.21428571428571427,0.5484126984126985
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,4,0.5714285714285714,0.06101190476190476
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,3,-0.07142857142857142,0.9048611111111111
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,4,0.21428571428571427,0.5484126984126985
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,3,0.14285714285714285,0.7195436507936508
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,4,0.47280542884465016,0.10506382347888965
aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762
aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,8,3,0.2857142857142857,0.39875992063492066
aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,8,4,0.4999999999999999,0.10868055555555556
aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,8,3,0.21428571428571427,0.5484126984126985
aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,8,4,-0.3571428571428571,0.27509920634920637
aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381
aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,8,2,0.47280542884465016,0.10506382347888965
aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,8,3,0.3571428571428571,0.27509920634920637
aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,8,4,0.18184824186332696,0.5330356744917513
aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,8,3,0.10910894511799618,0.7083840532183997
aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,8,4,-0.40006613209931935,0.17023995462900499
aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,8,0,0.47280542884465016,0.10506382347888965
aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,8,1,0.6182840223353117,0.0340492747686748
aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,8,2,0.5455447255899809,0.0614649096074132
aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,8,3,0.21428571428571427,0.5484126984126985
aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,8,4,-0.42857142857142855,0.17886904761904762
aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476
aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381
aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,8,3,0.2857142857142857,0.39875992063492066
aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,8,4,0.0,1.0
aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476
aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,8,3,0.21428571428571427,0.5484126984126985
aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,8,4,-0.07142857142857142,0.9048611111111111
aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,8,0,0.2857142857142857,0.39875992063492066
aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762
aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,8,2,0.14285714285714285,0.7195436507936508
aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,8,3,-0.14285714285714285,0.7195436507936508
aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,8,4,-0.42857142857142855,0.17886904761904762
aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762
aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762
aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,8,3,0.21428571428571427,0.5484126984126985
aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,8,4,-0.2545875386086578,0.38281014365989596
aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365
aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365
aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,8,3,0.18184824186332696,0.5330356744917513
aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,8,4,-0.21428571428571427,0.5484126984126985
aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556
aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476
aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,8,2,0.7637626158259734,0.008839740160738534
aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,8,3,0.6910233190806425,0.017844011512848347
aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,8,4,0.6428571428571428,0.03115079365079365
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,2,0.6910233190806425,0.017844011512848347
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,3,0.836501912571304,0.004136737098676645
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,4,0.9285714285714285,0.0003968253968253968
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,3,0.7637626158259734,0.008839740160738534
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,4,0.7142857142857142,0.014136904761904762
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,3,0.6182840223353117,0.0340492747686748
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,3,0.6428571428571428,0.03115079365079365
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,3,0.6428571428571428,0.03115079365079365
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,4,0.7142857142857142,0.014136904761904762