Spaces:
Running
Running
Yotam Perlitz
commited on
Commit
•
05398d1
1
Parent(s):
0f8e886
git commit finalize app
Browse filesSigned-off-by: Yotam Perlitz <[email protected]>
- .gitignore +3 -0
- app.py +384 -88
- assets/ablations.png +0 -0
- assets/combined_holistic.csv +0 -825
- assets/livebench.csv +0 -365
- assets/pointplot_granularity_matters.png +0 -0
- cache/agreements_cache_42471fdfe00c7ff9b0aba18b66ab5a5f.csv +73 -0
- cache/agreements_cache_6ac32881b7d0a3bf6d8762ff242ff449.csv +721 -0
- cache/agreements_cache_9aca1000dd25da3a044f5fd80fad0266.csv +721 -0
- cache/agreements_cache_a8b645e4d5ba862fbfa9ef3ecf73b44c.csv +721 -0
- cache/agreements_cache_facdc1028ee0edd9aed491afc51b884d.csv +73 -0
- requirements.txt +3 -0
.gitignore
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
.vscode/launch.json
|
2 |
+
.vscode/settings.json
|
3 |
+
.DS_Store
|
app.py
CHANGED
@@ -1,117 +1,413 @@
|
|
1 |
-
import
|
|
|
|
|
2 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
|
4 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
import pandas as pd
|
7 |
from bat import Tester, Config, Benchmark, Reporter
|
8 |
from bat.utils import get_holistic_benchmark
|
9 |
|
10 |
-
|
11 |
cfg = Config(
|
12 |
exp_to_run="example",
|
13 |
n_models_taken_list=[0],
|
14 |
model_select_strategy_list=["random"],
|
15 |
-
n_exps=10
|
16 |
-
# reference_data_path="data/combined_holistic.csv",
|
17 |
)
|
|
|
18 |
|
|
|
19 |
|
20 |
-
|
21 |
-
new_bench_agg_name = f"{newbench_name}_mwr"
|
22 |
|
|
|
23 |
tester = Tester(cfg=cfg)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
-
|
26 |
-
# reference_benchmark=get_holistic_benchmark(), n_models=20
|
27 |
-
# )
|
28 |
|
|
|
|
|
29 |
newbench = Benchmark(
|
30 |
-
pd.read_csv(f"assets/{newbench_name}.csv"),
|
31 |
data_source=newbench_name,
|
32 |
)
|
|
|
|
|
33 |
|
34 |
-
|
35 |
-
# newbench_agreements = tester.all_vs_all_agreement_testing(newbench)
|
36 |
|
|
|
|
|
|
|
|
|
37 |
reporter = Reporter()
|
38 |
-
|
39 |
-
|
40 |
-
|
|
|
41 |
|
42 |
-
holistic
|
43 |
-
holistic.add_aggragete(new_col_name="aggregate", agg_source_name="holistic")
|
44 |
|
45 |
-
|
|
|
46 |
allbench.clear_repeated_scenarios(source_to_keep=newbench_name)
|
|
|
|
|
|
|
47 |
|
|
|
48 |
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
all_agreements = run_load()
|
55 |
-
|
56 |
-
observed_scenario = "arena_elo" # "livebench_lb"
|
57 |
-
blacklist_sources = [] # "livebench"
|
58 |
-
|
59 |
-
z_score = reporter.get_z_score(all_agreements, observed_scenario, blacklist_sources)
|
60 |
-
|
61 |
-
st.write(f"zscore of {observed_scenario}: {z_score}")
|
62 |
-
|
63 |
-
# df = pd.read_csv("BAT_w_arena_10_random.csv")
|
64 |
-
# df = (
|
65 |
-
# (
|
66 |
-
# df.rename(
|
67 |
-
# columns={
|
68 |
-
# "z_score": "Z_Score",
|
69 |
-
# "benchmark": "Benchmark",
|
70 |
-
# }
|
71 |
-
# ).drop(
|
72 |
-
# columns=[
|
73 |
-
# "Unnamed: 0",
|
74 |
-
# "z_test_pass",
|
75 |
-
# ]
|
76 |
-
# )
|
77 |
-
# )
|
78 |
-
# .sort_values("Z_Score", ascending=False)
|
79 |
-
# .query(
|
80 |
-
# 'Benchmark!="Aggregate" and Benchmark!="MAGI" and Benchmark!="Alpaca(v2, len adj)" and Benchmark!="GPT4All"'
|
81 |
-
# )
|
82 |
-
# )
|
83 |
-
|
84 |
-
|
85 |
-
# df.replace(
|
86 |
-
# {
|
87 |
-
# "Arena Elo": "LMSys Arena",
|
88 |
-
# "Hugging-6": "HF OpenLLM",
|
89 |
-
# "Alpaca(v2)": "Alpaca v2",
|
90 |
-
# "Alpaca(v1)": "Alpaca v1",
|
91 |
-
# "EQ-Bench(v2)": "EQ-Bench v2",
|
92 |
-
# },
|
93 |
-
# inplace=True,
|
94 |
-
# )
|
95 |
-
|
96 |
-
# col1, col2, col3 = st.columns(3)
|
97 |
-
|
98 |
-
# with col1:
|
99 |
-
# st.header(" Agree")
|
100 |
-
# st.dataframe(df.query("Z_Score>=0"), hide_index=True)
|
101 |
-
|
102 |
-
# with col2:
|
103 |
-
# st.header(" Disagree")
|
104 |
-
# st.dataframe(df.query("Z_Score<0").sort_values("Z_Score"), hide_index=True)
|
105 |
-
|
106 |
-
# with col3:
|
107 |
-
# st.header(" Configs")
|
108 |
-
# # st.selectbox(label="Reference Benchmarks", options=["LMSys Arena"])
|
109 |
-
# options = st.multiselect(
|
110 |
-
# "Reference Benchmarks",
|
111 |
-
# ["LMSys Arena", "Open Compass", "Yellow", "Red", "Blue"],
|
112 |
-
# ["LMSys Arena", "Open Compass"],
|
113 |
-
# )
|
114 |
-
# st.selectbox(label="# models compared", options=[20])
|
115 |
-
# st.selectbox(label="Model Select Strategy", options=["Random"])
|
116 |
-
# st.write("")
|
117 |
-
# st.button("Upload a new benchmark")
|
|
|
1 |
+
import hashlib
|
2 |
+
import os
|
3 |
+
|
4 |
import pandas as pd
|
5 |
+
import plotly.express as px
|
6 |
+
import streamlit as st
|
7 |
+
from bat import Benchmark, Config, Reporter, Tester
|
8 |
+
from bat.utils import get_holistic_benchmark
|
9 |
+
|
10 |
+
benchmarks_dict = {
|
11 |
+
"arena_elo": "LMSys Arena",
|
12 |
+
"mt_bench": "MT Bench",
|
13 |
+
"mixeval": "Mix Eval",
|
14 |
+
"alpacav2": "AlpacaEval V2",
|
15 |
+
"arena_hard": "Arena Hard",
|
16 |
+
"arc_c": "ARC-C",
|
17 |
+
"eq_benchv2": "EQ Bench V2",
|
18 |
+
"agieval": "AGIEval",
|
19 |
+
"llmonitor": "LLMonitor",
|
20 |
+
"bbh": "BBH",
|
21 |
+
"mmlu": "MMLU",
|
22 |
+
"alpacav1": "AlpacaEval V1",
|
23 |
+
"magi": "MAGI",
|
24 |
+
"alpacaeval2_lc": "AlpacaEval V2 Length Adjusted",
|
25 |
+
"gpt4all": "GPT-4-All",
|
26 |
+
"humaneval": "HumanEval",
|
27 |
+
"mbpp": "MBPP",
|
28 |
+
"hellaswag": "HellaSwag",
|
29 |
+
"hugging_6": "HF OpenLLM V1",
|
30 |
+
"winogrande": "Winogrande",
|
31 |
+
}
|
32 |
+
|
33 |
+
st.markdown(
|
34 |
+
"""<h1 style='text-align: center; color: black;'>🏋️♂️ BenchBench Leaderboard 🏋️♂️</h1>""",
|
35 |
+
unsafe_allow_html=True,
|
36 |
+
)
|
37 |
+
|
38 |
+
st.markdown(
|
39 |
+
"We are excited to share the BenchBench-Leaderboard, a crucial component of our comprehensive research on Benchmark Agreement Testing (BAT) [work](#). "
|
40 |
+
"This leaderboard is a meta-benchmark that ranks benchmarks based on their agreement with the crowd harnessing many different references. "
|
41 |
+
)
|
42 |
+
|
43 |
+
|
44 |
+
all_scenarios_for_aggragate = get_holistic_benchmark().get_scenarios()
|
45 |
+
|
46 |
+
st.subheader("The Leaderboard", divider=True)
|
47 |
+
# st.subheader("🏋️♂️ BenchBench Leaderboard 🏋", divider=True)
|
48 |
+
|
49 |
+
leftcol, rightcol = st.columns([2, 1])
|
50 |
+
with leftcol:
|
51 |
+
with st.expander("Leaderboard configurations (defaults are great BTW)", icon="⚙️"):
|
52 |
+
with st.form("my_form"):
|
53 |
+
all_scenarios_for_aggragate_with_all = all_scenarios_for_aggragate.tolist()
|
54 |
+
all_scenarios_for_aggragate_with_all.append("All Holistic")
|
55 |
+
|
56 |
+
aggragate_scenarios = st.multiselect(
|
57 |
+
"Scenarios in Aggregate",
|
58 |
+
all_scenarios_for_aggragate_with_all,
|
59 |
+
["All Holistic"],
|
60 |
+
# all_scenarios_for_aggragate,
|
61 |
+
)
|
62 |
+
|
63 |
+
corr_type = st.selectbox(
|
64 |
+
label="Select Correlation type", options=["kendall", "pearson"], index=0
|
65 |
+
)
|
66 |
+
|
67 |
+
aggragate_scenario_blacklist = (
|
68 |
+
[
|
69 |
+
scen
|
70 |
+
for scen in all_scenarios_for_aggragate
|
71 |
+
if scen not in aggragate_scenarios
|
72 |
+
]
|
73 |
+
if "All Holistic" not in aggragate_scenarios
|
74 |
+
else []
|
75 |
+
)
|
76 |
+
|
77 |
+
model_select_strategy = st.selectbox(
|
78 |
+
label="Select strategy",
|
79 |
+
options=["random", "top_aggregate", "somewhere_aggregate"],
|
80 |
+
index=0,
|
81 |
+
)
|
82 |
+
|
83 |
+
n_models_taken_list = [5]
|
84 |
+
n_exps = 10
|
85 |
+
|
86 |
+
submitted = st.form_submit_button(label="Run BAT")
|
87 |
+
|
88 |
+
with rightcol:
|
89 |
+
st.button("➕ Add your benchmark here!")
|
90 |
+
|
91 |
+
|
92 |
+
def run_load(
    aggragate_scenario_blacklist=None,
    n_models_taken_list=None,
    model_select_strategy_list=None,
    corr_types=None,
    n_exps=10,
):
    """Return the all-vs-all agreements table, using an on-disk CSV cache.

    The arguments are stringified and hashed (MD5) into a cache file name
    under ``cache/``. If that file exists it is read with ``pd.read_csv`` and
    returned; otherwise the agreements are computed with the ``bat`` package,
    written to the cache file, and returned.

    Args:
        aggragate_scenario_blacklist: Scenarios passed as ``scenario_blacklist``
            to ``holistic.add_aggragete``. Defaults to ``[]``.
        n_models_taken_list: Forwarded to ``Config``. Defaults to ``[5]``.
        model_select_strategy_list: Forwarded to ``Config``. Defaults to
            ``["random"]``.
        corr_types: Forwarded to ``Config``. Defaults to ``["kendall"]``.
        n_exps: Forwarded to ``Config``; replaced by ``1`` when
            ``n_models_taken_list == [0]``.

    Returns:
        The agreements table: a ``pandas.DataFrame`` on a cache hit, otherwise
        whatever ``Tester.all_vs_all_agreement_testing`` returns (presumably
        also a DataFrame, since it is saved with ``to_csv`` -- confirm).
    """
    # None sentinels instead of mutable list defaults; the resolved values are
    # the same lists the original signature used, so cache keys are unchanged.
    if aggragate_scenario_blacklist is None:
        aggragate_scenario_blacklist = []
    if n_models_taken_list is None:
        n_models_taken_list = [5]
    if model_select_strategy_list is None:
        model_select_strategy_list = ["random"]
    if corr_types is None:
        corr_types = ["kendall"]

    # Create a hash of the inputs to generate a unique cache file for each set
    # of inputs. The string layout must stay as-is: existing cache files are
    # keyed by it.
    input_str = (
        str(aggragate_scenario_blacklist)
        + str(n_models_taken_list)
        + str(model_select_strategy_list)
        + str(corr_types)
        + str(n_exps)
    )
    input_hash = hashlib.md5(input_str.encode()).hexdigest()
    cache_file = f"agreements_cache_{input_hash}.csv"

    cache_dir = "cache"
    cache_path = os.path.join(cache_dir, cache_file)

    # Cache hit: return the stored table without touching the bat package.
    if os.path.exists(cache_path):
        print("Loading cached results...")
        return pd.read_csv(cache_path)

    print("Cached results not found, calculating")

    cfg = Config(
        exp_to_run="example",
        n_models_taken_list=n_models_taken_list,
        model_select_strategy_list=model_select_strategy_list,
        corr_types=corr_types,
        n_exps=n_exps if n_models_taken_list != [0] else 1,
    )

    holistic = get_holistic_benchmark()
    # Captured before the aggregate is added; referenced by name in the
    # DataFrame.query string below, so this local must keep its name.
    holistic_scenarios = holistic.get_scenarios()
    holistic.clear_repeated_scenarios()
    holistic.add_aggragete(
        new_col_name="aggregate",
        agg_source_name="holistic",
        scenario_blacklist=aggragate_scenario_blacklist,
        min_scenario_for_models_to_appear_in_agg=5,
    )

    allbench = Benchmark(
        pd.read_csv("assets/combined_20240704.csv"),
    )
    allbench.df = allbench.df.drop(columns=["tag"])
    allbench.clear_repeated_scenarios()
    # Drop scenarios that the holistic benchmark already provides.
    allbench.df = allbench.df.query("scenario not in @holistic_scenarios")

    allbench.df = allbench.df[~allbench.df["scenario"].str.contains("_mixed")]
    allbench.df = allbench.df[~allbench.df["scenario"].str.contains("agentbench")]

    allbench = allbench.extend(holistic)

    tester = Tester(cfg=cfg)

    agreements = tester.all_vs_all_agreement_testing(
        allbench, single_source_scenario="aggregate"
    )

    # Make sure the cache directory exists; to_csv fails on a missing parent
    # directory, which would discard the freshly computed result.
    os.makedirs(cache_dir, exist_ok=True)
    agreements.to_csv(cache_path, index=False)

    return agreements
|
168 |
+
|
169 |
+
|
170 |
+
agreements = run_load(
|
171 |
+
aggragate_scenario_blacklist=aggragate_scenario_blacklist,
|
172 |
+
n_models_taken_list=n_models_taken_list,
|
173 |
+
model_select_strategy_list=[model_select_strategy],
|
174 |
+
corr_types=[corr_type],
|
175 |
+
n_exps=n_exps,
|
176 |
+
)
|
177 |
+
|
178 |
+
reporter = Reporter()
|
179 |
+
z_scores = reporter.get_all_z_scores(agreements=agreements, aggragate_name="aggregate")
|
180 |
+
|
181 |
+
corr_name = f"{'KT' if corr_type=='kendall' else 'Per.'} Corr."
|
182 |
+
|
183 |
+
z_scores["z_score"] = z_scores["z_score"].round(2)
|
184 |
+
z_scores["corr_with_agg"] = z_scores["corr_with_agg"].round(2)
|
185 |
+
z_scores["p_value_of_corr_with_agg"] = z_scores["p_value_of_corr_with_agg"].round(2)
|
186 |
+
|
187 |
+
data = (
|
188 |
+
z_scores.rename(
|
189 |
+
columns={
|
190 |
+
"scenario": "Benchmark",
|
191 |
+
"z_score": "Z Score",
|
192 |
+
"corr_with_agg": corr_name,
|
193 |
+
"p_value_of_corr_with_agg": "p value of Corr.",
|
194 |
+
"source": "Source",
|
195 |
+
}
|
196 |
+
)
|
197 |
+
.sort_values("Z Score", ascending=False)
|
198 |
+
.reset_index(drop=True)
|
199 |
+
)
|
200 |
+
|
201 |
+
|
202 |
+
data = data[~data["Source"].str.contains("livebench")]
|
203 |
+
data = data[~data["Source"].str.contains("biggen")]
|
204 |
+
data.drop(columns=["Source"], inplace=True)
|
205 |
+
data["Benchmark"] = data["Benchmark"].apply(lambda x: benchmarks_dict[x])
|
206 |
+
|
207 |
+
# Apply coloring based on the 'Z Score' values
|
208 |
+
|
209 |
+
styled_data = data.style.background_gradient(
|
210 |
+
subset=["Z Score"],
|
211 |
+
cmap="RdYlGn",
|
212 |
+
vmin=-data["Z Score"].abs().max(),
|
213 |
+
vmax=data["Z Score"].abs().max(),
|
214 |
+
).format(subset=["Z Score", corr_name, "p value of Corr."], formatter="{:.2}")
|
215 |
+
|
216 |
+
st.dataframe(
|
217 |
+
data=styled_data,
|
218 |
+
hide_index=True,
|
219 |
+
use_container_width=True,
|
220 |
+
height=300,
|
221 |
+
)
|
222 |
+
|
223 |
+
st.markdown(
|
224 |
+
"BenchBench-Leaderboard complements our study, where we analyzed over 40 prominent benchmarks and introduced standardized practices to enhance the robustness and validity of benchmark evaluations through the [BenchBench Python package](#). "
|
225 |
+
"The BenchBench-Leaderboard serves as a dynamic platform for benchmark comparison and is an essential tool for researchers and practitioners in the language model field aiming to select and utilize benchmarks effectively. "
|
226 |
+
)
|
227 |
+
|
228 |
+
st.subheader("How did we get the Z Scores?", divider=True)
|
229 |
+
|
230 |
+
st.write(r"""
|
231 |
+
Section 3.1 in our work shows how using a single reference benchmark drastically hurts the robustness and validity of BAT.
|
232 |
+
To remedy this, we propose to test benchmark agreement with an aggregate benchmark and compare this agreement to that of other benchmarks.
|
233 |
+
We recommend performing this comparison using the [Z score](https://en.wikipedia.org/wiki/Standard_score) and demonstrate how it is obtained for a benchmark of your selection.
|
234 |
+
The Z score is computed as follows: $z_i=(x_i-\mu_{i...N}) / \sigma_{i...N}$ where $x_i$ is the agreement of the $i$th benchmark to the aggregate and $\mu_{i...N}$, $\sigma_{i...N}$ are the
|
235 |
+
mean and standard deviation of the agreements of the other benchmarks to the aggregate.
|
236 |
+
""")
|
237 |
+
|
238 |
+
|
239 |
+
benchmarks = data["Benchmark"].unique().tolist()
|
240 |
+
plotted_scenario = st.selectbox(
|
241 |
+
"Choose Benchmark to plot", benchmarks, index=benchmarks.index("LMSys Arena")
|
242 |
+
)
|
243 |
+
|
244 |
+
|
245 |
+
fig = px.histogram(
|
246 |
+
data.query("Benchmark!=@plotted_scenario"), x=corr_name, nbins=len(data) - 1
|
247 |
+
)
|
248 |
+
# Add a vertical line at a specific x-coordinate
|
249 |
+
# Replace 'x_value' with the actual value where you want the line
|
250 |
+
x_value = 0.5 # Example value, adjust as necessary
|
251 |
+
fig.add_vline(
|
252 |
+
x=data.query("Benchmark==@plotted_scenario")[corr_name].iloc[0],
|
253 |
+
line_dash="dash",
|
254 |
+
line_color="red",
|
255 |
+
)
|
256 |
+
# Update layout to add a title
|
257 |
+
fig.update_layout(
|
258 |
+
title="Histogram of Correlation Values", # Change the title text as needed
|
259 |
+
title_x=0.3, # Centers the title
|
260 |
+
title_font=dict(size=20, family="CMU"), # Customize font if needed
|
261 |
+
)
|
262 |
+
|
263 |
+
# # Plot!
|
264 |
+
st.plotly_chart(fig, use_container_width=True)
|
265 |
+
|
266 |
+
st.subheader("Why should you use the BenchBench Leaderboard?")
|
267 |
+
|
268 |
+
st.markdown(
|
269 |
+
"""
|
270 |
+
|
271 |
+
Current practices in Benchmark Agreement Testing (BAT) often suffer from a lack of standardization and transparency, which can lead to inconsistent results and diminished trust in benchmark evaluations. Several key issues are prevalent in the field:
|
272 |
+
|
273 |
+
"""
|
274 |
+
)
|
275 |
+
|
276 |
+
st.markdown(
|
277 |
+
"""
|
278 |
+
- **Lack of Standard Methodologies:** Unlike other scientific procedures that follow rigorous methodologies, BAT lacks uniform procedures across different studies. Researchers often employ varied criteria for selecting benchmarks and models for comparison, which leads to results that cannot be easily compared or replicated. This variation undermines the reliability of conclusions drawn from BAT and makes it difficult for other researchers to build on existing work.
|
279 |
+
"""
|
280 |
+
)
|
281 |
+
|
282 |
+
st.image(
|
283 |
+
"assets/motivation.png",
|
284 |
+
caption="Conclusions depend on the models considered. Kendall-tau correlations between the LMSys Arena benchmark and three other benchmarks: BBH, MMLU, and Alpaca v2. Each group of bars represents the correlation for different sets of top models, specifically the top 5, top 10, and top 15 (overlapping) models (according to the Arena). The results indicate that the degree of agreement between benchmarks varies with the number of top models considered, highlighting that different selections of models can lead to varying conclusions about benchmark agreement.",
|
285 |
+
use_column_width=True,
|
286 |
+
)
|
287 |
+
|
288 |
+
st.markdown(
|
289 |
+
"""
|
290 |
+
- **Arbitrary Selection of Reference Benchmarks:** One of the most critical decisions in BAT is the choice of reference benchmarks. Currently, this choice is often arbitrary and lacks a clear rationale, influenced by availability or personal preference rather than strategic alignment with the benchmark’s purpose. This can skew the results significantly, as different benchmarks may not be equally representative or relevant to the models being tested.
|
291 |
+
"""
|
292 |
+
)
|
293 |
+
st.markdown(
|
294 |
+
"""
|
295 |
+
- **Inadequate Model Representation:** BAT frequently relies on a limited subset of models, which may not comprehensively represent the diversity of architectures and training paradigms in modern language models. This selective representation can lead to biased agreement scores that favor certain types of models over others, failing to provide a holistic view of model performance across different benchmarks.
|
296 |
+
"""
|
297 |
+
)
|
298 |
|
299 |
+
st.image(
|
300 |
+
"assets/pointplot_granularity_matters.png",
|
301 |
+
caption="Correlations increase with number of models. Mean correlation (y) between each benchmark (lines) and the rest, given different numbers of models. The Blue and Orange lines are the average of all benchmark pair correlations with models sampled randomly (orange) or in contiguous sets (blue). The shaded lines represents adjacent sampling for the different benchmarks.",
|
302 |
+
use_column_width=True,
|
303 |
+
)
|
304 |
+
|
305 |
+
st.markdown(
|
306 |
+
"""
|
307 |
+
- **Overemphasis on Correlation Metrics:** Current BAT practices tend to over-rely on correlation metrics without adequately considering their limitations and the context of their application. While these metrics can provide useful insights, they are often treated as definitive evidence of agreement without acknowledging that high correlation does not necessarily imply conceptual alignment between benchmarks.
|
308 |
+
"""
|
309 |
+
)
|
310 |
+
|
311 |
+
st.markdown(
|
312 |
+
"""
|
313 |
+
To address these issues, there is a critical need for a more structured approach to BAT that includes clear guidelines for benchmark and model selection, a broader consideration of agreement metrics, and an acknowledgment of the evolving nature of technology in this space. By reforming BAT practices, the research community can improve the reliability and utility of benchmarks as tools for evaluating and advancing language models.
|
314 |
+
"""
|
315 |
+
)
|
316 |
+
|
317 |
+
|
318 |
+
st.image(
|
319 |
+
"assets/ablations.png",
|
320 |
+
caption="Our recommendations substantially reduce the variance of BAT. Ablation analysis for each BAT recommendation separately and their combinations.",
|
321 |
+
use_column_width=True,
|
322 |
+
)
|
323 |
+
|
324 |
+
|
325 |
+
st.header("The BenchBench package")
|
326 |
+
|
327 |
+
st.markdown("""
|
328 |
+
### Overview
|
329 |
+
|
330 |
+
The BAT package is designed to facilitate benchmark agreement testing for NLP models. It allows users to easily compare multiple models against various benchmarks and generate comprehensive reports on their agreement.
|
331 |
+
|
332 |
+
### Installation
|
333 |
|
334 |
+
To install the BAT package, you can use pip:
|
335 |
+
|
336 |
+
```
|
337 |
+
pip install bat-package
|
338 |
+
```
|
339 |
+
|
340 |
+
### Usage Example
|
341 |
+
|
342 |
+
Below is a step-by-step example of how to use the BAT package to perform agreement testing.
|
343 |
+
|
344 |
+
#### Step 1: Configuration
|
345 |
+
|
346 |
+
First, set up the configuration for the tests:
|
347 |
+
|
348 |
+
```python
|
349 |
import pandas as pd
|
350 |
from bat import Tester, Config, Benchmark, Reporter
|
351 |
from bat.utils import get_holistic_benchmark
|
352 |
|
|
|
353 |
cfg = Config(
|
354 |
exp_to_run="example",
|
355 |
n_models_taken_list=[0],
|
356 |
model_select_strategy_list=["random"],
|
357 |
+
n_exps=10
|
|
|
358 |
)
|
359 |
+
```
|
360 |
|
361 |
+
#### Step 2: Fetch Model Names
|
362 |
|
363 |
+
Fetch the names of the reference models to be used for scoring:
|
|
|
364 |
|
365 |
+
```python
|
366 |
tester = Tester(cfg=cfg)
|
367 |
+
models_for_benchmark_scoring = tester.fetch_reference_models_names(
|
368 |
+
reference_benchmark=get_holistic_benchmark(), n_models=20
|
369 |
+
)
|
370 |
+
print(models_for_benchmark_scoring)
|
371 |
+
```
|
372 |
+
|
373 |
+
#### Step 3: Load and Prepare Benchmark
|
374 |
|
375 |
+
Load a new benchmark and add an aggregate column:
|
|
|
|
|
376 |
|
377 |
+
```python
|
378 |
+
newbench_name = "fakebench"
|
379 |
newbench = Benchmark(
|
380 |
+
pd.read_csv(f"src/bat/assets/{newbench_name}.csv"),
|
381 |
data_source=newbench_name,
|
382 |
)
|
383 |
+
newbench.add_aggregate(new_col_name=f"{newbench_name}_mwr")
|
384 |
+
```
|
385 |
|
386 |
+
#### Step 4: Agreement Testing
|
|
|
387 |
|
388 |
+
Perform all-vs-all agreement testing on the new benchmark:
|
389 |
+
|
390 |
+
```python
|
391 |
+
newbench_agreements = tester.all_vs_all_agreement_testing(newbench)
|
392 |
reporter = Reporter()
|
393 |
+
reporter.draw_agreements(newbench_agreements)
|
394 |
+
```
|
395 |
+
|
396 |
+
#### Step 5: Extend and Clean Benchmark
|
397 |
|
398 |
+
Extend the new benchmark with holistic data and clear repeated scenarios:
|
|
|
399 |
|
400 |
+
```python
|
401 |
+
allbench = newbench.extend(get_holistic_benchmark())
|
402 |
allbench.clear_repeated_scenarios(source_to_keep=newbench_name)
|
403 |
+
```
|
404 |
+
|
405 |
+
#### Step 6: Comprehensive Agreement Testing
|
406 |
|
407 |
+
Perform comprehensive agreement testing and visualize:
|
408 |
|
409 |
+
```python
|
410 |
+
all_agreements = tester.all_vs_all_agreement_testing(allbench)
|
411 |
+
reporter.draw_agreements(all_agreements)
|
412 |
+
```
|
413 |
+
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
assets/ablations.png
ADDED
assets/combined_holistic.csv
DELETED
@@ -1,825 +0,0 @@
|
|
1 |
-
,model,score,scenario,source,aggragated_from
|
2 |
-
0,gpt-4-turbo-2024-04-09,82.6,arena-hard,arena_hard_2404,[]
|
3 |
-
1,gpt-4-0125-preview,78.0,arena-hard,arena_hard_2404,[]
|
4 |
-
2,gemini-1.5-pro-api-preview,72.0,arena-hard,arena_hard_2404,[]
|
5 |
-
3,yi-large,63.7,arena-hard,arena_hard_2404,[]
|
6 |
-
4,claude-3-opus-20240229,60.4,arena-hard,arena_hard_2404,[]
|
7 |
-
5,glm-4,55.7,arena-hard,arena_hard_2404,[]
|
8 |
-
6,gpt-4-0314,50.0,arena-hard,arena_hard_2404,[]
|
9 |
-
7,gemini-1.5-flash-api-preview,49.6,arena-hard,arena_hard_2404,[]
|
10 |
-
8,claude-3-sonnet-20240229,46.8,arena-hard,arena_hard_2404,[]
|
11 |
-
9,claude-3-haiku-20240307,41.5,arena-hard,arena_hard_2404,[]
|
12 |
-
10,llama-3-70b-chat-hf,41.1,arena-hard,arena_hard_2404,[]
|
13 |
-
11,gpt-4-0613,37.9,arena-hard,arena_hard_2404,[]
|
14 |
-
12,mistral-large-2402,37.7,arena-hard,arena_hard_2404,[]
|
15 |
-
13,mixtral-8x22b-instruct-v0.1,36.4,arena-hard,arena_hard_2404,[]
|
16 |
-
14,qwen1.5-72b-chat,36.1,arena-hard,arena_hard_2404,[]
|
17 |
-
15,command-r-plus,33.1,arena-hard,arena_hard_2404,[]
|
18 |
-
16,mistral-medium,31.9,arena-hard,arena_hard_2404,[]
|
19 |
-
17,mistral-next,27.4,arena-hard,arena_hard_2404,[]
|
20 |
-
18,gpt-3.5-turbo-0613,24.8,arena-hard,arena_hard_2404,[]
|
21 |
-
19,claude-2.0,24.0,arena-hard,arena_hard_2404,[]
|
22 |
-
20,dbrx-instructruct,23.9,arena-hard,arena_hard_2404,[]
|
23 |
-
21,mixtral-8x7b-instruct-v0.1,23.4,arena-hard,arena_hard_2404,[]
|
24 |
-
22,gpt-3.5-turbo-0125,23.3,arena-hard,arena_hard_2404,[]
|
25 |
-
23,yi-34b-chat,23.1,arena-hard,arena_hard_2404,[]
|
26 |
-
24,starling-lm-7b-beta,23.0,arena-hard,arena_hard_2404,[]
|
27 |
-
25,claude-2.1,22.8,arena-hard,arena_hard_2404,[]
|
28 |
-
26,snorkel-mistral-pairrm-dpo,20.7,arena-hard,arena_hard_2404,[]
|
29 |
-
27,llama-3-8b-chat-hf,20.6,arena-hard,arena_hard_2404,[]
|
30 |
-
28,gpt-3.5-turbo-1106,18.9,arena-hard,arena_hard_2404,[]
|
31 |
-
29,gpt-3.5-turbo-0301,18.1,arena-hard,arena_hard_2404,[]
|
32 |
-
30,gemini-1.0-pro,17.8,arena-hard,arena_hard_2404,[]
|
33 |
-
31,snowflake-arctic-instruct,17.6,arena-hard,arena_hard_2404,[]
|
34 |
-
32,command-r,17.0,arena-hard,arena_hard_2404,[]
|
35 |
-
33,phi-3-mini-128k-instruct,15.4,arena-hard,arena_hard_2404,[]
|
36 |
-
34,tulu-2-dpo-70b,15.0,arena-hard,arena_hard_2404,[]
|
37 |
-
35,starling-lm-7b-alpha,12.8,arena-hard,arena_hard_2404,[]
|
38 |
-
36,mistral-7b-instruct,12.6,arena-hard,arena_hard_2404,[]
|
39 |
-
37,gemma-1.1-7b-it,12.1,arena-hard,arena_hard_2404,[]
|
40 |
-
38,llama-2-70b-chat-hf,11.6,arena-hard,arena_hard_2404,[]
|
41 |
-
39,vicuna-33b-v1.3,8.6,arena-hard,arena_hard_2404,[]
|
42 |
-
40,gemma-7b-it,7.5,arena-hard,arena_hard_2404,[]
|
43 |
-
41,llama-2-7b-chat-hf,4.6,arena-hard,arena_hard_2404,[]
|
44 |
-
42,gemma-1.1-2b-it,3.4,arena-hard,arena_hard_2404,[]
|
45 |
-
43,gemma-2b-it,3.0,arena-hard,arena_hard_2404,[]
|
46 |
-
0,gpt-4o-2024-05-13,64.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
47 |
-
1,claude-3-opus,63.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
48 |
-
2,gpt-4-turbo-2024-04-09,62.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
49 |
-
3,gemini-1.5-pro-api-0409,58.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
50 |
-
4,yi-large-preview,56.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
51 |
-
5,llama-3-70b-instruct,55.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
52 |
-
6,qwen-max-0428,55.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
53 |
-
7,claude-3-sonnet,54.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
54 |
-
8,reka-core-20240415,52.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
55 |
-
9,mammoth2-8x7b-plus,51.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
56 |
-
10,deepseek-v2,51.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
57 |
-
11,command-r-plus,51.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
58 |
-
12,yi-1.5-34b-chat,51.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
59 |
-
13,mistral-large,50.3,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
60 |
-
14,qwen1.5-72b-chat,48.3,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
61 |
-
15,mistral-medium,47.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
62 |
-
16,gemini-1.0-pro,46.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
63 |
-
17,reka-flash-20240226,46.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
64 |
-
18,mistral-small,46.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
65 |
-
19,llama-3-8b-instruct,45.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
66 |
-
20,command-r,45.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
67 |
-
21,qwen1.5-32b-chat,43.3,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
68 |
-
22,gpt-3.5-turbo-0125,43.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
69 |
-
23,claude-3-haiku,42.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
70 |
-
24,yi-34b-chat,42.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
71 |
-
25,mixtral-8x7b-instruct-v0.1,42.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
72 |
-
26,starling-lm-7b-beta,41.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
73 |
-
27,yi-1.5-9b-chat,40.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
74 |
-
28,gemma-1.1-7b-it,39.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
75 |
-
29,vicuna-33b-v1.3,38.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
76 |
-
30,llama-2-70b-chat,38.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
77 |
-
31,map-neo-instruct-v0.1,37.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
78 |
-
32,mistral-7b-instruct-v0.2,36.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
79 |
-
33,qwen1.5-7b-chat,35.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
80 |
-
34,reka-edge-20240208,32.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
81 |
-
35,zephyr-7b-beta,31.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
82 |
-
36,llama-2-7b-chat,30.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
83 |
-
37,yi-6b-chat,30.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
84 |
-
38,qwen1.5-moe-a2.7b-chat,29.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
85 |
-
39,gemma-1.1-2b-it,28.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
86 |
-
40,vicuna-7b-v1.5,27.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
87 |
-
41,olmo-7b-instruct,26.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
88 |
-
42,qwen1.5-4b-chat,24.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
89 |
-
43,jetmoe-8b-chat,24.3,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
90 |
-
44,mpt-7b-chat,23.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
91 |
-
45,llama-3-70b,54.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
92 |
-
46,qwen1.5-72b,41.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
93 |
-
47,yi-34b,47.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
94 |
-
48,qwen1.5-32b,41.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
95 |
-
49,mixtral-8x7b,40.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
96 |
-
50,llama-2-70b,41.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
97 |
-
51,qwen1.5-moe-a2.7b,33.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
98 |
-
52,qwen1.5-7b,33.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
99 |
-
53,llama-3-8b,31.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
100 |
-
54,mistral-7b,27.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
101 |
-
55,gemma-7b,32.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
102 |
-
56,yi-6b,30.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
103 |
-
57,qwen1.5-4b,23.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
104 |
-
58,jetmoe-8b,27.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
105 |
-
59,deepseek-7b,21.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
106 |
-
60,phi-2,21.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
107 |
-
61,deepseekmoe-16b,24.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
108 |
-
62,llama-2-7b,22.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
109 |
-
63,gemma-2b,22.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
110 |
-
64,olmo-7b,21.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
111 |
-
65,mpt-7b,17.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
112 |
-
66,gpt-4o-2024-05-13,87.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
113 |
-
67,claude-3-opus,88.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
114 |
-
68,gpt-4-turbo-2024-04-09,88.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
115 |
-
69,gemini-1.5-pro-api-0409,84.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
116 |
-
70,yi-large-preview,84.4,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
117 |
-
71,llama-3-70b-instruct,84.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
118 |
-
72,qwen-max-0428,86.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
119 |
-
73,claude-3-sonnet,81.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
120 |
-
74,reka-core-20240415,83.3,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
121 |
-
75,mammoth2-8x7b-plus,81.5,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
122 |
-
76,deepseek-v2,83.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
123 |
-
77,command-r-plus,81.5,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
124 |
-
78,yi-1.5-34b-chat,81.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
125 |
-
79,mistral-large,84.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
126 |
-
80,qwen1.5-72b-chat,84.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
127 |
-
81,mistral-medium,81.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
128 |
-
82,gemini-1.0-pro,78.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
129 |
-
83,reka-flash-20240226,79.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
130 |
-
84,mistral-small,81.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
131 |
-
85,llama-3-8b-instruct,75.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
132 |
-
86,command-r,77.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
133 |
-
87,qwen1.5-32b-chat,81.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
134 |
-
88,gpt-3.5-turbo-0125,79.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
135 |
-
89,claude-3-haiku,79.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
136 |
-
90,yi-34b-chat,80.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
137 |
-
91,mixtral-8x7b-instruct-v0.1,76.4,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
138 |
-
92,starling-lm-7b-beta,74.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
139 |
-
93,yi-1.5-9b-chat,74.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
140 |
-
94,gemma-1.1-7b-it,69.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
141 |
-
95,vicuna-33b-v1.3,66.3,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
142 |
-
96,llama-2-70b-chat,74.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
143 |
-
97,map-neo-instruct-v0.1,70.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
144 |
-
98,mistral-7b-instruct-v0.2,70.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
145 |
-
99,qwen1.5-7b-chat,71.4,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
146 |
-
100,reka-edge-20240208,68.5,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
147 |
-
101,zephyr-7b-beta,69.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
148 |
-
102,llama-2-7b-chat,61.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
149 |
-
103,yi-6b-chat,65.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
150 |
-
104,qwen1.5-moe-a2.7b-chat,69.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
151 |
-
105,gemma-1.1-2b-it,51.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
152 |
-
106,vicuna-7b-v1.5,60.3,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
153 |
-
107,olmo-7b-instruct,55.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
154 |
-
108,qwen1.5-4b-chat,57.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
155 |
-
109,jetmoe-8b-chat,51.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
156 |
-
110,mpt-7b-chat,43.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
157 |
-
111,llama-3-70b,82.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
158 |
-
112,qwen1.5-72b,79.5,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
159 |
-
113,yi-34b,78.3,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
160 |
-
114,qwen1.5-32b,77.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
161 |
-
115,mixtral-8x7b,74.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
162 |
-
116,llama-2-70b,73.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
163 |
-
117,qwen1.5-moe-a2.7b,70.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
164 |
-
118,qwen1.5-7b,68.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
165 |
-
119,llama-3-8b,65.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
166 |
-
120,mistral-7b,64.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
167 |
-
121,gemma-7b,64.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
168 |
-
122,yi-6b,63.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
169 |
-
123,qwen1.5-4b,58.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
170 |
-
124,jetmoe-8b,57.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
171 |
-
125,deepseek-7b,52.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
172 |
-
126,phi-2,51.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
173 |
-
127,deepseekmoe-16b,51.4,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
174 |
-
128,llama-2-7b,43.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
175 |
-
129,gemma-2b,38.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
176 |
-
130,olmo-7b,31.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
177 |
-
131,mpt-7b,30.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
178 |
-
264,gpt-4o-2024-05-13,85.4,mmlu-mixed,mixeval_240601,[]
|
179 |
-
265,claude-3-opus,83.2,mmlu-mixed,mixeval_240601,[]
|
180 |
-
266,gpt-4-turbo-2024-04-09,82.8,mmlu-mixed,mixeval_240601,[]
|
181 |
-
267,gemini-1.5-pro-api-0409,79.2,mmlu-mixed,mixeval_240601,[]
|
182 |
-
268,yi-large-preview,80.9,mmlu-mixed,mixeval_240601,[]
|
183 |
-
269,llama-3-70b-instruct,80.5,mmlu-mixed,mixeval_240601,[]
|
184 |
-
270,qwen-max-0428,80.6,mmlu-mixed,mixeval_240601,[]
|
185 |
-
271,claude-3-sonnet,74.7,mmlu-mixed,mixeval_240601,[]
|
186 |
-
272,reka-core-20240415,79.3,mmlu-mixed,mixeval_240601,[]
|
187 |
-
273,mammoth2-8x7b-plus,74.5,mmlu-mixed,mixeval_240601,[]
|
188 |
-
274,deepseek-v2,77.3,mmlu-mixed,mixeval_240601,[]
|
189 |
-
275,command-r-plus,78.9,mmlu-mixed,mixeval_240601,[]
|
190 |
-
276,yi-1.5-34b-chat,76.4,mmlu-mixed,mixeval_240601,[]
|
191 |
-
277,mistral-large,80.2,mmlu-mixed,mixeval_240601,[]
|
192 |
-
278,qwen1.5-72b-chat,80.1,mmlu-mixed,mixeval_240601,[]
|
193 |
-
279,mistral-medium,76.3,mmlu-mixed,mixeval_240601,[]
|
194 |
-
280,gemini-1.0-pro,74.9,mmlu-mixed,mixeval_240601,[]
|
195 |
-
281,reka-flash-20240226,75.4,mmlu-mixed,mixeval_240601,[]
|
196 |
-
282,mistral-small,75.2,mmlu-mixed,mixeval_240601,[]
|
197 |
-
283,llama-3-8b-instruct,71.9,mmlu-mixed,mixeval_240601,[]
|
198 |
-
284,command-r,75.0,mmlu-mixed,mixeval_240601,[]
|
199 |
-
285,qwen1.5-32b-chat,78.0,mmlu-mixed,mixeval_240601,[]
|
200 |
-
286,gpt-3.5-turbo-0125,74.5,mmlu-mixed,mixeval_240601,[]
|
201 |
-
287,claude-3-haiku,76.1,mmlu-mixed,mixeval_240601,[]
|
202 |
-
288,yi-34b-chat,73.6,mmlu-mixed,mixeval_240601,[]
|
203 |
-
289,mixtral-8x7b-instruct-v0.1,72.0,mmlu-mixed,mixeval_240601,[]
|
204 |
-
290,starling-lm-7b-beta,69.0,mmlu-mixed,mixeval_240601,[]
|
205 |
-
291,yi-1.5-9b-chat,72.6,mmlu-mixed,mixeval_240601,[]
|
206 |
-
292,gemma-1.1-7b-it,66.9,mmlu-mixed,mixeval_240601,[]
|
207 |
-
293,vicuna-33b-v1.3,59.2,mmlu-mixed,mixeval_240601,[]
|
208 |
-
294,llama-2-70b-chat,69.8,mmlu-mixed,mixeval_240601,[]
|
209 |
-
295,map-neo-instruct-v0.1,66.7,mmlu-mixed,mixeval_240601,[]
|
210 |
-
296,mistral-7b-instruct-v0.2,67.3,mmlu-mixed,mixeval_240601,[]
|
211 |
-
297,qwen1.5-7b-chat,68.7,mmlu-mixed,mixeval_240601,[]
|
212 |
-
298,reka-edge-20240208,63.6,mmlu-mixed,mixeval_240601,[]
|
213 |
-
299,zephyr-7b-beta,64.9,mmlu-mixed,mixeval_240601,[]
|
214 |
-
300,llama-2-7b-chat,59.4,mmlu-mixed,mixeval_240601,[]
|
215 |
-
301,yi-6b-chat,65.4,mmlu-mixed,mixeval_240601,[]
|
216 |
-
302,qwen1.5-moe-a2.7b-chat,69.5,mmlu-mixed,mixeval_240601,[]
|
217 |
-
303,gemma-1.1-2b-it,51.5,mmlu-mixed,mixeval_240601,[]
|
218 |
-
304,vicuna-7b-v1.5,58.7,mmlu-mixed,mixeval_240601,[]
|
219 |
-
305,olmo-7b-instruct,57.1,mmlu-mixed,mixeval_240601,[]
|
220 |
-
306,qwen1.5-4b-chat,61.4,mmlu-mixed,mixeval_240601,[]
|
221 |
-
307,jetmoe-8b-chat,58.5,mmlu-mixed,mixeval_240601,[]
|
222 |
-
308,mpt-7b-chat,37.8,mmlu-mixed,mixeval_240601,[]
|
223 |
-
309,llama-3-70b,79.8,mmlu-mixed,mixeval_240601,[]
|
224 |
-
310,qwen1.5-72b,78.8,mmlu-mixed,mixeval_240601,[]
|
225 |
-
311,yi-34b,79.3,mmlu-mixed,mixeval_240601,[]
|
226 |
-
312,qwen1.5-32b,77.2,mmlu-mixed,mixeval_240601,[]
|
227 |
-
313,mixtral-8x7b,71.6,mmlu-mixed,mixeval_240601,[]
|
228 |
-
314,llama-2-70b,70.8,mmlu-mixed,mixeval_240601,[]
|
229 |
-
315,qwen1.5-moe-a2.7b,69.4,mmlu-mixed,mixeval_240601,[]
|
230 |
-
316,qwen1.5-7b,67.0,mmlu-mixed,mixeval_240601,[]
|
231 |
-
317,llama-3-8b,69.5,mmlu-mixed,mixeval_240601,[]
|
232 |
-
318,mistral-7b,68.5,mmlu-mixed,mixeval_240601,[]
|
233 |
-
319,gemma-7b,67.4,mmlu-mixed,mixeval_240601,[]
|
234 |
-
320,yi-6b,71.2,mmlu-mixed,mixeval_240601,[]
|
235 |
-
321,qwen1.5-4b,59.6,mmlu-mixed,mixeval_240601,[]
|
236 |
-
322,jetmoe-8b,55.3,mmlu-mixed,mixeval_240601,[]
|
237 |
-
323,deepseek-7b,53.3,mmlu-mixed,mixeval_240601,[]
|
238 |
-
324,phi-2,62.5,mmlu-mixed,mixeval_240601,[]
|
239 |
-
325,deepseekmoe-16b,49.9,mmlu-mixed,mixeval_240601,[]
|
240 |
-
326,llama-2-7b,40.8,mmlu-mixed,mixeval_240601,[]
|
241 |
-
327,gemma-2b,37.4,mmlu-mixed,mixeval_240601,[]
|
242 |
-
328,olmo-7b,29.7,mmlu-mixed,mixeval_240601,[]
|
243 |
-
329,mpt-7b,30.9,mmlu-mixed,mixeval_240601,[]
|
244 |
-
594,gpt-4o-2024-05-13,57.1,mmlu-hard-mixed,mixeval_240601,[]
|
245 |
-
595,claude-3-opus,55.0,mmlu-hard-mixed,mixeval_240601,[]
|
246 |
-
596,gpt-4-turbo-2024-04-09,45.5,mmlu-hard-mixed,mixeval_240601,[]
|
247 |
-
597,gemini-1.5-pro-api-0409,44.6,mmlu-hard-mixed,mixeval_240601,[]
|
248 |
-
598,yi-large-preview,48.5,mmlu-hard-mixed,mixeval_240601,[]
|
249 |
-
599,llama-3-70b-instruct,46.3,mmlu-hard-mixed,mixeval_240601,[]
|
250 |
-
600,qwen-max-0428,41.6,mmlu-hard-mixed,mixeval_240601,[]
|
251 |
-
601,claude-3-sonnet,40.7,mmlu-hard-mixed,mixeval_240601,[]
|
252 |
-
602,reka-core-20240415,46.3,mmlu-hard-mixed,mixeval_240601,[]
|
253 |
-
603,mammoth2-8x7b-plus,41.1,mmlu-hard-mixed,mixeval_240601,[]
|
254 |
-
604,deepseek-v2,42.0,mmlu-hard-mixed,mixeval_240601,[]
|
255 |
-
605,command-r-plus,42.0,mmlu-hard-mixed,mixeval_240601,[]
|
256 |
-
606,yi-1.5-34b-chat,38.1,mmlu-hard-mixed,mixeval_240601,[]
|
257 |
-
607,mistral-large,42.4,mmlu-hard-mixed,mixeval_240601,[]
|
258 |
-
608,qwen1.5-72b-chat,37.7,mmlu-hard-mixed,mixeval_240601,[]
|
259 |
-
609,mistral-medium,38.5,mmlu-hard-mixed,mixeval_240601,[]
|
260 |
-
610,gemini-1.0-pro,35.5,mmlu-hard-mixed,mixeval_240601,[]
|
261 |
-
611,reka-flash-20240226,34.6,mmlu-hard-mixed,mixeval_240601,[]
|
262 |
-
612,mistral-small,33.8,mmlu-hard-mixed,mixeval_240601,[]
|
263 |
-
613,llama-3-8b-instruct,40.7,mmlu-hard-mixed,mixeval_240601,[]
|
264 |
-
614,command-r,39.0,mmlu-hard-mixed,mixeval_240601,[]
|
265 |
-
615,qwen1.5-32b-chat,29.9,mmlu-hard-mixed,mixeval_240601,[]
|
266 |
-
616,gpt-3.5-turbo-0125,35.1,mmlu-hard-mixed,mixeval_240601,[]
|
267 |
-
617,claude-3-haiku,30.7,mmlu-hard-mixed,mixeval_240601,[]
|
268 |
-
618,yi-34b-chat,29.9,mmlu-hard-mixed,mixeval_240601,[]
|
269 |
-
619,mixtral-8x7b-instruct-v0.1,37.2,mmlu-hard-mixed,mixeval_240601,[]
|
270 |
-
620,starling-lm-7b-beta,34.2,mmlu-hard-mixed,mixeval_240601,[]
|
271 |
-
621,yi-1.5-9b-chat,36.8,mmlu-hard-mixed,mixeval_240601,[]
|
272 |
-
622,gemma-1.1-7b-it,39.0,mmlu-hard-mixed,mixeval_240601,[]
|
273 |
-
623,vicuna-33b-v1.3,39.4,mmlu-hard-mixed,mixeval_240601,[]
|
274 |
-
624,llama-2-70b-chat,27.7,mmlu-hard-mixed,mixeval_240601,[]
|
275 |
-
625,map-neo-instruct-v0.1,32.5,mmlu-hard-mixed,mixeval_240601,[]
|
276 |
-
626,mistral-7b-instruct-v0.2,29.4,mmlu-hard-mixed,mixeval_240601,[]
|
277 |
-
627,qwen1.5-7b-chat,29.0,mmlu-hard-mixed,mixeval_240601,[]
|
278 |
-
628,reka-edge-20240208,26.4,mmlu-hard-mixed,mixeval_240601,[]
|
279 |
-
629,zephyr-7b-beta,24.2,mmlu-hard-mixed,mixeval_240601,[]
|
280 |
-
630,llama-2-7b-chat,30.3,mmlu-hard-mixed,mixeval_240601,[]
|
281 |
-
631,yi-6b-chat,26.8,mmlu-hard-mixed,mixeval_240601,[]
|
282 |
-
632,qwen1.5-moe-a2.7b-chat,26.8,mmlu-hard-mixed,mixeval_240601,[]
|
283 |
-
633,gemma-1.1-2b-it,30.3,mmlu-hard-mixed,mixeval_240601,[]
|
284 |
-
634,vicuna-7b-v1.5,23.4,mmlu-hard-mixed,mixeval_240601,[]
|
285 |
-
635,olmo-7b-instruct,27.3,mmlu-hard-mixed,mixeval_240601,[]
|
286 |
-
636,qwen1.5-4b-chat,17.3,mmlu-hard-mixed,mixeval_240601,[]
|
287 |
-
637,jetmoe-8b-chat,25.5,mmlu-hard-mixed,mixeval_240601,[]
|
288 |
-
638,mpt-7b-chat,24.7,mmlu-hard-mixed,mixeval_240601,[]
|
289 |
-
639,llama-3-70b,39.8,mmlu-hard-mixed,mixeval_240601,[]
|
290 |
-
640,qwen1.5-72b,42.4,mmlu-hard-mixed,mixeval_240601,[]
|
291 |
-
641,yi-34b,42.4,mmlu-hard-mixed,mixeval_240601,[]
|
292 |
-
642,qwen1.5-32b,37.2,mmlu-hard-mixed,mixeval_240601,[]
|
293 |
-
643,mixtral-8x7b,34.6,mmlu-hard-mixed,mixeval_240601,[]
|
294 |
-
644,llama-2-70b,29.0,mmlu-hard-mixed,mixeval_240601,[]
|
295 |
-
645,qwen1.5-moe-a2.7b,30.7,mmlu-hard-mixed,mixeval_240601,[]
|
296 |
-
646,qwen1.5-7b,28.6,mmlu-hard-mixed,mixeval_240601,[]
|
297 |
-
647,llama-3-8b,38.5,mmlu-hard-mixed,mixeval_240601,[]
|
298 |
-
648,mistral-7b,27.7,mmlu-hard-mixed,mixeval_240601,[]
|
299 |
-
649,gemma-7b,28.1,mmlu-hard-mixed,mixeval_240601,[]
|
300 |
-
650,yi-6b,37.2,mmlu-hard-mixed,mixeval_240601,[]
|
301 |
-
651,qwen1.5-4b,22.9,mmlu-hard-mixed,mixeval_240601,[]
|
302 |
-
652,jetmoe-8b,27.3,mmlu-hard-mixed,mixeval_240601,[]
|
303 |
-
653,deepseek-7b,26.4,mmlu-hard-mixed,mixeval_240601,[]
|
304 |
-
654,phi-2,29.0,mmlu-hard-mixed,mixeval_240601,[]
|
305 |
-
655,deepseekmoe-16b,30.7,mmlu-hard-mixed,mixeval_240601,[]
|
306 |
-
656,llama-2-7b,24.7,mmlu-hard-mixed,mixeval_240601,[]
|
307 |
-
657,gemma-2b,27.3,mmlu-hard-mixed,mixeval_240601,[]
|
308 |
-
658,olmo-7b,25.1,mmlu-hard-mixed,mixeval_240601,[]
|
309 |
-
659,mpt-7b,24.2,mmlu-hard-mixed,mixeval_240601,[]
|
310 |
-
593,gpt-4-0314,0.57,agieval,BLZ_240312,[]
|
311 |
-
594,gpt-4-0613,0.57,agieval,BLZ_240312,[]
|
312 |
-
596,claude-1,0.49700000000000005,agieval,BLZ_240312,[]
|
313 |
-
601,mixtral-8x7b-instruct-v0.1,0.45299999999999996,agieval,BLZ_240312,[]
|
314 |
-
602,yi-34b-chat,0.508,agieval,BLZ_240312,[]
|
315 |
-
605,gpt-3.5-turbo-0314,0.43200000000000005,agieval,BLZ_240312,[]
|
316 |
-
608,vicuna-33b,0.373,agieval,BLZ_240312,[]
|
317 |
-
609,starling-lm-7b-alpha,0.401,agieval,BLZ_240312,[]
|
318 |
-
611,llama-2-70b-chat,0.45,agieval,BLZ_240312,[]
|
319 |
-
613,openhermes-2.5-mistral-7b,0.43,agieval,BLZ_240312,[]
|
320 |
-
614,openchat-3.5,0.42700000000000005,agieval,BLZ_240312,[]
|
321 |
-
617,solar-10.7b-instruct-v1.0,0.47600000000000003,agieval,BLZ_240312,[]
|
322 |
-
618,dolphin-2.2.1-mistral-7b,0.392,agieval,BLZ_240312,[]
|
323 |
-
620,zephyr-7b-beta,0.406,agieval,BLZ_240312,[]
|
324 |
-
623,llama-2-13b-chat,0.336,agieval,BLZ_240312,[]
|
325 |
-
624,vicuna-13b,0.368,agieval,BLZ_240312,[]
|
326 |
-
626,zephyr-7b-alpha,0.38,agieval,BLZ_240312,[]
|
327 |
-
627,qwen-14b-chat,0.396,agieval,BLZ_240312,[]
|
328 |
-
630,llama-2-7b-chat,0.29600000000000004,agieval,BLZ_240312,[]
|
329 |
-
632,mistral-7b-instruct-v0.1,0.335,agieval,BLZ_240312,[]
|
330 |
-
634,vicuna-7b,0.314,agieval,BLZ_240312,[]
|
331 |
-
636,chatglm3-6b,0.414,agieval,BLZ_240312,[]
|
332 |
-
643,chatglm-6b,0.325,agieval,BLZ_240312,[]
|
333 |
-
647,llama-13b,0.205,agieval,BLZ_240312,[]
|
334 |
-
886,gpt-4-1106-preview,0.977,alpacav1,BLZ_240312,[]
|
335 |
-
888,gpt-4-0314,0.9528,alpacav1,BLZ_240312,[]
|
336 |
-
889,gpt-4-0613,0.9528,alpacav1,BLZ_240312,[]
|
337 |
-
890,mistral-medium,0.9682999999999999,alpacav1,BLZ_240312,[]
|
338 |
-
891,claude-1,0.8839,alpacav1,BLZ_240312,[]
|
339 |
-
892,claude-2.0,0.9136,alpacav1,BLZ_240312,[]
|
340 |
-
893,gemini-pro-dev-api,0.7966,alpacav1,BLZ_240312,[]
|
341 |
-
894,claude-2.1,0.8708,alpacav1,BLZ_240312,[]
|
342 |
-
895,gpt-3.5-turbo-0613,0.8937,alpacav1,BLZ_240312,[]
|
343 |
-
896,mixtral-8x7b-instruct-v0.1,0.9478,alpacav1,BLZ_240312,[]
|
344 |
-
897,yi-34b-chat,0.9408,alpacav1,BLZ_240312,[]
|
345 |
-
898,gemini-pro,0.7966,alpacav1,BLZ_240312,[]
|
346 |
-
900,gpt-3.5-turbo-0314,0.8937,alpacav1,BLZ_240312,[]
|
347 |
-
902,tulu-2-dpo-70b,0.9503,alpacav1,BLZ_240312,[]
|
348 |
-
903,vicuna-33b,0.8898999999999999,alpacav1,BLZ_240312,[]
|
349 |
-
904,starling-lm-7b-alpha,0.9198999999999999,alpacav1,BLZ_240312,[]
|
350 |
-
906,llama-2-70b-chat,0.9266,alpacav1,BLZ_240312,[]
|
351 |
-
909,openchat-3.5,0.8851,alpacav1,BLZ_240312,[]
|
352 |
-
911,gpt-3.5-turbo-1106,0.8626,alpacav1,BLZ_240312,[]
|
353 |
-
914,wizardlm-13b-v1.2,0.8917,alpacav1,BLZ_240312,[]
|
354 |
-
915,zephyr-7b-beta,0.9059999999999999,alpacav1,BLZ_240312,[]
|
355 |
-
918,llama-2-13b-chat,0.8109000000000001,alpacav1,BLZ_240312,[]
|
356 |
-
921,zephyr-7b-alpha,0.8576,alpacav1,BLZ_240312,[]
|
357 |
-
924,guanaco-33b,0.6596,alpacav1,BLZ_240312,[]
|
358 |
-
925,llama-2-7b-chat,0.7137,alpacav1,BLZ_240312,[]
|
359 |
-
934,chatglm2-6b,0.47130000000000005,alpacav1,BLZ_240312,[]
|
360 |
-
937,openassistant-pythia-12b,0.2596,alpacav1,BLZ_240312,[]
|
361 |
-
827,gpt-4-1106-preview,0.5,alpacav2,BLZ_240312,[]
|
362 |
-
829,gpt-4-0314,0.221,alpacav2,BLZ_240312,[]
|
363 |
-
830,gpt-4-0613,0.158,alpacav2,BLZ_240312,[]
|
364 |
-
831,mistral-medium,0.21899999999999997,alpacav2,BLZ_240312,[]
|
365 |
-
832,claude-1,0.17,alpacav2,BLZ_240312,[]
|
366 |
-
833,claude-2.0,0.172,alpacav2,BLZ_240312,[]
|
367 |
-
834,gemini-pro-dev-api,0.16899999999999998,alpacav2,BLZ_240312,[]
|
368 |
-
835,claude-2.1,0.157,alpacav2,BLZ_240312,[]
|
369 |
-
836,gpt-3.5-turbo-0613,0.141,alpacav2,BLZ_240312,[]
|
370 |
-
837,mixtral-8x7b-instruct-v0.1,0.183,alpacav2,BLZ_240312,[]
|
371 |
-
838,yi-34b-chat,0.297,alpacav2,BLZ_240312,[]
|
372 |
-
839,gemini-pro,0.16899999999999998,alpacav2,BLZ_240312,[]
|
373 |
-
840,claude-instant-1,0.161,alpacav2,BLZ_240312,[]
|
374 |
-
841,gpt-3.5-turbo-0314,0.096,alpacav2,BLZ_240312,[]
|
375 |
-
842,wizardlm-70b-v1.0,0.14400000000000002,alpacav2,BLZ_240312,[]
|
376 |
-
843,tulu-2-dpo-70b,0.16,alpacav2,BLZ_240312,[]
|
377 |
-
844,vicuna-33b,0.127,alpacav2,BLZ_240312,[]
|
378 |
-
845,starling-lm-7b-alpha,0.142,alpacav2,BLZ_240312,[]
|
379 |
-
846,deepseek-llm-67b-chat,0.121,alpacav2,BLZ_240312,[]
|
380 |
-
847,llama-2-70b-chat,0.139,alpacav2,BLZ_240312,[]
|
381 |
-
849,openhermes-2.5-mistral-7b,0.10300000000000001,alpacav2,BLZ_240312,[]
|
382 |
-
852,gpt-3.5-turbo-1106,0.092,alpacav2,BLZ_240312,[]
|
383 |
-
854,dolphin-2.2.1-mistral-7b,0.09,alpacav2,BLZ_240312,[]
|
384 |
-
855,wizardlm-13b-v1.2,0.12,alpacav2,BLZ_240312,[]
|
385 |
-
856,zephyr-7b-beta,0.11,alpacav2,BLZ_240312,[]
|
386 |
-
859,llama-2-13b-chat,0.077,alpacav2,BLZ_240312,[]
|
387 |
-
860,vicuna-13b,0.067,alpacav2,BLZ_240312,[]
|
388 |
-
862,zephyr-7b-alpha,0.084,alpacav2,BLZ_240312,[]
|
389 |
-
863,qwen-14b-chat,0.075,alpacav2,BLZ_240312,[]
|
390 |
-
865,guanaco-33b,0.05,alpacav2,BLZ_240312,[]
|
391 |
-
866,llama-2-7b-chat,0.0496,alpacav2,BLZ_240312,[]
|
392 |
-
870,vicuna-7b,0.048,alpacav2,BLZ_240312,[]
|
393 |
-
875,chatglm2-6b,0.027999999999999997,alpacav2,BLZ_240312,[]
|
394 |
-
878,openassistant-pythia-12b,0.018000000000000002,alpacav2,BLZ_240312,[]
|
395 |
-
1299,gpt-4-1106-preview,0.32799999999999996,alpacaeval2-lc,BLZ_240312,[]
|
396 |
-
1301,gpt-4-0314,0.21600000000000003,alpacaeval2-lc,BLZ_240312,[]
|
397 |
-
1302,gpt-4-0613,0.18600000000000003,alpacaeval2-lc,BLZ_240312,[]
|
398 |
-
1303,mistral-medium,0.196,alpacaeval2-lc,BLZ_240312,[]
|
399 |
-
1304,claude-1,0.21100000000000002,alpacaeval2-lc,BLZ_240312,[]
|
400 |
-
1305,claude-2.0,0.21600000000000003,alpacaeval2-lc,BLZ_240312,[]
|
401 |
-
1306,gemini-pro-dev-api,0.172,alpacaeval2-lc,BLZ_240312,[]
|
402 |
-
1307,claude-2.1,0.193,alpacaeval2-lc,BLZ_240312,[]
|
403 |
-
1308,gpt-3.5-turbo-0613,0.14300000000000002,alpacaeval2-lc,BLZ_240312,[]
|
404 |
-
1309,mixtral-8x7b-instruct-v0.1,0.168,alpacaeval2-lc,BLZ_240312,[]
|
405 |
-
1310,yi-34b-chat,0.188,alpacaeval2-lc,BLZ_240312,[]
|
406 |
-
1312,claude-instant-1,0.195,alpacaeval2-lc,BLZ_240312,[]
|
407 |
-
1313,gpt-3.5-turbo-0314,0.156,alpacaeval2-lc,BLZ_240312,[]
|
408 |
-
1314,wizardlm-70b-v1.0,0.125,alpacaeval2-lc,BLZ_240312,[]
|
409 |
-
1315,tulu-2-dpo-70b,0.151,alpacaeval2-lc,BLZ_240312,[]
|
410 |
-
1316,vicuna-33b,0.115,alpacaeval2-lc,BLZ_240312,[]
|
411 |
-
1317,starling-lm-7b-alpha,0.10099999999999999,alpacaeval2-lc,BLZ_240312,[]
|
412 |
-
1318,deepseek-llm-67b-chat,0.141,alpacaeval2-lc,BLZ_240312,[]
|
413 |
-
1319,llama-2-70b-chat,0.10400000000000001,alpacaeval2-lc,BLZ_240312,[]
|
414 |
-
1321,openhermes-2.5-mistral-7b,0.126,alpacaeval2-lc,BLZ_240312,[]
|
415 |
-
1324,gpt-3.5-turbo-1106,0.155,alpacaeval2-lc,BLZ_240312,[]
|
416 |
-
1326,dolphin-2.2.1-mistral-7b,0.10800000000000001,alpacaeval2-lc,BLZ_240312,[]
|
417 |
-
1327,wizardlm-13b-v1.2,0.099,alpacaeval2-lc,BLZ_240312,[]
|
418 |
-
1328,zephyr-7b-beta,0.102,alpacaeval2-lc,BLZ_240312,[]
|
419 |
-
1331,llama-2-13b-chat,0.068,alpacaeval2-lc,BLZ_240312,[]
|
420 |
-
1332,vicuna-13b,0.085,alpacaeval2-lc,BLZ_240312,[]
|
421 |
-
1334,zephyr-7b-alpha,0.086,alpacaeval2-lc,BLZ_240312,[]
|
422 |
-
1335,qwen-14b-chat,0.1,alpacaeval2-lc,BLZ_240312,[]
|
423 |
-
1338,llama-2-7b-chat,0.045,alpacaeval2-lc,BLZ_240312,[]
|
424 |
-
1342,vicuna-7b,0.06,alpacaeval2-lc,BLZ_240312,[]
|
425 |
-
0,gpt-4-0125-preview,1.0,arena-elo,BLZ_240312,[]
|
426 |
-
1,gpt-4-1106-preview,0.9992019154030327,arena-elo,BLZ_240312,[]
|
427 |
-
2,bard-gemini-pro,0.9768555466879489,arena-elo,BLZ_240312,[]
|
428 |
-
3,gpt-4-0314,0.9497206703910615,arena-elo,BLZ_240312,[]
|
429 |
-
4,gpt-4-0613,0.9273743016759777,arena-elo,BLZ_240312,[]
|
430 |
-
5,mistral-medium,0.9177972865123704,arena-elo,BLZ_240312,[]
|
431 |
-
6,claude-1,0.9169992019154031,arena-elo,BLZ_240312,[]
|
432 |
-
7,claude-2.0,0.9034317637669593,arena-elo,BLZ_240312,[]
|
433 |
-
8,gemini-pro-dev-api,0.8938547486033519,arena-elo,BLZ_240312,[]
|
434 |
-
9,claude-2.1,0.8930566640063847,arena-elo,BLZ_240312,[]
|
435 |
-
10,gpt-3.5-turbo-0613,0.8922585794094174,arena-elo,BLZ_240312,[]
|
436 |
-
11,mixtral-8x7b-instruct-v0.1,0.8922585794094174,arena-elo,BLZ_240312,[]
|
437 |
-
12,yi-34b-chat,0.8898643256185156,arena-elo,BLZ_240312,[]
|
438 |
-
13,gemini-pro,0.8890662410215483,arena-elo,BLZ_240312,[]
|
439 |
-
14,claude-instant-1,0.8850758180367119,arena-elo,BLZ_240312,[]
|
440 |
-
15,gpt-3.5-turbo-0314,0.8818834796488427,arena-elo,BLZ_240312,[]
|
441 |
-
16,wizardlm-70b-v1.0,0.8818834796488427,arena-elo,BLZ_240312,[]
|
442 |
-
17,tulu-2-dpo-70b,0.8810853950518756,arena-elo,BLZ_240312,[]
|
443 |
-
18,vicuna-33b,0.8723064644852354,arena-elo,BLZ_240312,[]
|
444 |
-
19,starling-lm-7b-alpha,0.8699122106943336,arena-elo,BLZ_240312,[]
|
445 |
-
20,deepseek-llm-67b-chat,0.8635275339185954,arena-elo,BLZ_240312,[]
|
446 |
-
21,llama-2-70b-chat,0.8635275339185954,arena-elo,BLZ_240312,[]
|
447 |
-
22,nv-llama2-70b-steerlm-chat,0.8603351955307262,arena-elo,BLZ_240312,[]
|
448 |
-
23,openhermes-2.5-mistral-7b,0.8603351955307262,arena-elo,BLZ_240312,[]
|
449 |
-
24,openchat-3.5,0.8587390263367917,arena-elo,BLZ_240312,[]
|
450 |
-
25,pplx-70b-online,0.8587390263367917,arena-elo,BLZ_240312,[]
|
451 |
-
26,gpt-3.5-turbo-1106,0.8547486033519553,arena-elo,BLZ_240312,[]
|
452 |
-
27,solar-10.7b-instruct-v1.0,0.8499600957701516,arena-elo,BLZ_240312,[]
|
453 |
-
28,dolphin-2.2.1-mistral-7b,0.8499600957701516,arena-elo,BLZ_240312,[]
|
454 |
-
29,wizardlm-13b-v1.2,0.8443735035913806,arena-elo,BLZ_240312,[]
|
455 |
-
30,zephyr-7b-beta,0.8387869114126097,arena-elo,BLZ_240312,[]
|
456 |
-
31,mpt-30b-chat,0.8332003192338387,arena-elo,BLZ_240312,[]
|
457 |
-
32,codellama-34b-instruct,0.8324022346368715,arena-elo,BLZ_240312,[]
|
458 |
-
33,llama-2-13b-chat,0.8316041500399042,arena-elo,BLZ_240312,[]
|
459 |
-
34,vicuna-13b,0.8300079808459697,arena-elo,BLZ_240312,[]
|
460 |
-
35,pplx-7b-online,0.8284118116520351,arena-elo,BLZ_240312,[]
|
461 |
-
36,zephyr-7b-alpha,0.8276137270550679,arena-elo,BLZ_240312,[]
|
462 |
-
37,qwen-14b-chat,0.825219473264166,arena-elo,BLZ_240312,[]
|
463 |
-
38,falcon-180b-chat,0.8236233040702314,arena-elo,BLZ_240312,[]
|
464 |
-
39,guanaco-33b,0.8236233040702314,arena-elo,BLZ_240312,[]
|
465 |
-
40,llama-2-7b-chat,0.8172386272944933,arena-elo,BLZ_240312,[]
|
466 |
-
41,stripedhyena-nous-7b,0.8140462889066241,arena-elo,BLZ_240312,[]
|
467 |
-
42,mistral-7b-instruct-v0.1,0.8028731045490822,arena-elo,BLZ_240312,[]
|
468 |
-
43,palm-chat-bison-001,0.8028731045490822,arena-elo,BLZ_240312,[]
|
469 |
-
44,vicuna-7b,0.8020750199521149,arena-elo,BLZ_240312,[]
|
470 |
-
45,koala-13b,0.770949720670391,arena-elo,BLZ_240312,[]
|
471 |
-
46,chatglm3-6b,0.7661612130885874,arena-elo,BLZ_240312,[]
|
472 |
-
47,gpt4all-13b-snoozy,0.74780526735834,arena-elo,BLZ_240312,[]
|
473 |
-
48,mpt-7b-chat,0.7430167597765364,arena-elo,BLZ_240312,[]
|
474 |
-
49,chatglm2-6b,0.7422186751795691,arena-elo,BLZ_240312,[]
|
475 |
-
50,rwkv-4-raven-14b,0.7382282521947326,arena-elo,BLZ_240312,[]
|
476 |
-
51,alpaca-13b,0.7214684756584198,arena-elo,BLZ_240312,[]
|
477 |
-
52,openassistant-pythia-12b,0.7158818834796489,arena-elo,BLZ_240312,[]
|
478 |
-
53,chatglm-6b,0.704708699122107,arena-elo,BLZ_240312,[]
|
479 |
-
54,fastchat-t5-3b,0.6975259377494014,arena-elo,BLZ_240312,[]
|
480 |
-
55,stablelm-tuned-alpha-7b,0.6743814844373504,arena-elo,BLZ_240312,[]
|
481 |
-
56,dolly-v2-12b,0.6568236233040702,arena-elo,BLZ_240312,[]
|
482 |
-
57,llama-13b,0.6384676775738228,arena-elo,BLZ_240312,[]
|
483 |
-
542,mixtral-8x7b-instruct-v0.1,0.7641,gpt4all,BLZ_240312,[]
|
484 |
-
543,yi-34b-chat,0.7212999999999999,gpt4all,BLZ_240312,[]
|
485 |
-
550,starling-lm-7b-alpha,0.7272,gpt4all,BLZ_240312,[]
|
486 |
-
554,openhermes-2.5-mistral-7b,0.7312000000000001,gpt4all,BLZ_240312,[]
|
487 |
-
555,openchat-3.5,0.7292000000000001,gpt4all,BLZ_240312,[]
|
488 |
-
558,solar-10.7b-instruct-v1.0,0.7511,gpt4all,BLZ_240312,[]
|
489 |
-
559,dolphin-2.2.1-mistral-7b,0.7223999999999999,gpt4all,BLZ_240312,[]
|
490 |
-
561,zephyr-7b-beta,0.7182999999999999,gpt4all,BLZ_240312,[]
|
491 |
-
565,vicuna-13b,0.631,gpt4all,BLZ_240312,[]
|
492 |
-
567,zephyr-7b-alpha,0.7223999999999999,gpt4all,BLZ_240312,[]
|
493 |
-
573,mistral-7b-instruct-v0.1,0.6795,gpt4all,BLZ_240312,[]
|
494 |
-
575,vicuna-7b,0.61,gpt4all,BLZ_240312,[]
|
495 |
-
576,koala-13b,0.62,gpt4all,BLZ_240312,[]
|
496 |
-
578,gpt4all-13b-snoozy,0.653,gpt4all,BLZ_240312,[]
|
497 |
-
579,mpt-7b-chat,0.648,gpt4all,BLZ_240312,[]
|
498 |
-
583,openassistant-pythia-12b,0.61,gpt4all,BLZ_240312,[]
|
499 |
-
585,fastchat-t5-3b,0.537,gpt4all,BLZ_240312,[]
|
500 |
-
586,stablelm-tuned-alpha-7b,0.513,gpt4all,BLZ_240312,[]
|
501 |
-
588,llama-13b,0.63,gpt4all,BLZ_240312,[]
|
502 |
-
129,mixtral-8x7b-instruct-v0.1,0.7262000000000001,hugging-6,BLZ_240312,[]
|
503 |
-
130,yi-34b-chat,0.6531999999999999,hugging-6,BLZ_240312,[]
|
504 |
-
134,wizardlm-70b-v1.0,0.6125,hugging-6,BLZ_240312,[]
|
505 |
-
135,tulu-2-dpo-70b,0.7376999999999999,hugging-6,BLZ_240312,[]
|
506 |
-
136,vicuna-33b,0.585,hugging-6,BLZ_240312,[]
|
507 |
-
137,starling-lm-7b-alpha,0.6713,hugging-6,BLZ_240312,[]
|
508 |
-
139,llama-2-70b-chat,0.624,hugging-6,BLZ_240312,[]
|
509 |
-
141,openhermes-2.5-mistral-7b,0.6152000000000001,hugging-6,BLZ_240312,[]
|
510 |
-
142,openchat-3.5,0.6124,hugging-6,BLZ_240312,[]
|
511 |
-
145,solar-10.7b-instruct-v1.0,0.742,hugging-6,BLZ_240312,[]
|
512 |
-
146,dolphin-2.2.1-mistral-7b,0.6493000000000001,hugging-6,BLZ_240312,[]
|
513 |
-
147,wizardlm-13b-v1.2,0.5476,hugging-6,BLZ_240312,[]
|
514 |
-
148,zephyr-7b-beta,0.6195,hugging-6,BLZ_240312,[]
|
515 |
-
149,mpt-30b-chat,0.5538000000000001,hugging-6,BLZ_240312,[]
|
516 |
-
150,codellama-34b-instruct,0.5729,hugging-6,BLZ_240312,[]
|
517 |
-
151,llama-2-13b-chat,0.5490999999999999,hugging-6,BLZ_240312,[]
|
518 |
-
152,vicuna-13b,0.5539999999999999,hugging-6,BLZ_240312,[]
|
519 |
-
154,zephyr-7b-alpha,0.595,hugging-6,BLZ_240312,[]
|
520 |
-
156,falcon-180b-chat,0.6785,hugging-6,BLZ_240312,[]
|
521 |
-
158,llama-2-7b-chat,0.5074000000000001,hugging-6,BLZ_240312,[]
|
522 |
-
160,mistral-7b-instruct-v0.1,0.5496,hugging-6,BLZ_240312,[]
|
523 |
-
162,vicuna-7b,0.521,hugging-6,BLZ_240312,[]
|
524 |
-
176,yi-34bx2-moe-60b,0.7672,hugging-6,BLZ_240312,[]
|
525 |
-
947,gpt-4-0314,0.93,llmonitor,BLZ_240312,[]
|
526 |
-
948,gpt-4-0613,0.89,llmonitor,BLZ_240312,[]
|
527 |
-
950,claude-1,0.66,llmonitor,BLZ_240312,[]
|
528 |
-
951,claude-2.0,0.68,llmonitor,BLZ_240312,[]
|
529 |
-
954,gpt-3.5-turbo-0613,0.81,llmonitor,BLZ_240312,[]
|
530 |
-
958,claude-instant-1,0.6,llmonitor,BLZ_240312,[]
|
531 |
-
959,gpt-3.5-turbo-0314,0.79,llmonitor,BLZ_240312,[]
|
532 |
-
965,llama-2-70b-chat,0.6,llmonitor,BLZ_240312,[]
|
533 |
-
975,mpt-30b-chat,0.4,llmonitor,BLZ_240312,[]
|
534 |
-
976,codellama-34b-instruct,0.34,llmonitor,BLZ_240312,[]
|
535 |
-
977,llama-2-13b-chat,0.5,llmonitor,BLZ_240312,[]
|
536 |
-
978,vicuna-13b,0.5,llmonitor,BLZ_240312,[]
|
537 |
-
982,falcon-180b-chat,0.67,llmonitor,BLZ_240312,[]
|
538 |
-
983,guanaco-33b,0.43,llmonitor,BLZ_240312,[]
|
539 |
-
984,llama-2-7b-chat,0.5,llmonitor,BLZ_240312,[]
|
540 |
-
986,mistral-7b-instruct-v0.1,0.57,llmonitor,BLZ_240312,[]
|
541 |
-
987,palm-chat-bison-001,0.57,llmonitor,BLZ_240312,[]
|
542 |
-
988,vicuna-7b,0.41,llmonitor,BLZ_240312,[]
|
543 |
-
989,koala-13b,0.31,llmonitor,BLZ_240312,[]
|
544 |
-
992,mpt-7b-chat,0.43,llmonitor,BLZ_240312,[]
|
545 |
-
1000,dolly-v2-12b,0.23,llmonitor,BLZ_240312,[]
|
546 |
-
59,gpt-4-0125-preview,0.0929,mt-bench,BLZ_240312,[]
|
547 |
-
60,gpt-4-1106-preview,0.0932,mt-bench,BLZ_240312,[]
|
548 |
-
62,gpt-4-0314,0.08960000000000001,mt-bench,BLZ_240312,[]
|
549 |
-
63,gpt-4-0613,0.09179999999999999,mt-bench,BLZ_240312,[]
|
550 |
-
64,mistral-medium,0.0861,mt-bench,BLZ_240312,[]
|
551 |
-
65,claude-1,0.079,mt-bench,BLZ_240312,[]
|
552 |
-
66,claude-2.0,0.0806,mt-bench,BLZ_240312,[]
|
553 |
-
67,gemini-pro-dev-api,0.08039999999999999,mt-bench,BLZ_240312,[]
|
554 |
-
68,claude-2.1,0.0818,mt-bench,BLZ_240312,[]
|
555 |
-
69,gpt-3.5-turbo-0613,0.0839,mt-bench,BLZ_240312,[]
|
556 |
-
70,mixtral-8x7b-instruct-v0.1,0.083,mt-bench,BLZ_240312,[]
|
557 |
-
71,yi-34b-chat,0.07769999999999999,mt-bench,BLZ_240312,[]
|
558 |
-
72,gemini-pro,0.08039999999999999,mt-bench,BLZ_240312,[]
|
559 |
-
73,claude-instant-1,0.0785,mt-bench,BLZ_240312,[]
|
560 |
-
74,gpt-3.5-turbo-0314,0.0794,mt-bench,BLZ_240312,[]
|
561 |
-
75,wizardlm-70b-v1.0,0.0771,mt-bench,BLZ_240312,[]
|
562 |
-
76,tulu-2-dpo-70b,0.0789,mt-bench,BLZ_240312,[]
|
563 |
-
77,vicuna-33b,0.0712,mt-bench,BLZ_240312,[]
|
564 |
-
78,starling-lm-7b-alpha,0.0809,mt-bench,BLZ_240312,[]
|
565 |
-
79,deepseek-llm-67b-chat,0.08529999999999999,mt-bench,BLZ_240312,[]
|
566 |
-
80,llama-2-70b-chat,0.06860000000000001,mt-bench,BLZ_240312,[]
|
567 |
-
81,nv-llama2-70b-steerlm-chat,0.0754,mt-bench,BLZ_240312,[]
|
568 |
-
82,openhermes-2.5-mistral-7b,0.07690000000000001,mt-bench,BLZ_240312,[]
|
569 |
-
83,openchat-3.5,0.0781,mt-bench,BLZ_240312,[]
|
570 |
-
84,pplx-70b-online,0.0588,mt-bench,BLZ_240312,[]
|
571 |
-
85,gpt-3.5-turbo-1106,0.0832,mt-bench,BLZ_240312,[]
|
572 |
-
86,solar-10.7b-instruct-v1.0,0.0758,mt-bench,BLZ_240312,[]
|
573 |
-
88,wizardlm-13b-v1.2,0.07200000000000001,mt-bench,BLZ_240312,[]
|
574 |
-
89,zephyr-7b-beta,0.07339999999999999,mt-bench,BLZ_240312,[]
|
575 |
-
90,mpt-30b-chat,0.0639,mt-bench,BLZ_240312,[]
|
576 |
-
92,llama-2-13b-chat,0.0665,mt-bench,BLZ_240312,[]
|
577 |
-
93,vicuna-13b,0.06570000000000001,mt-bench,BLZ_240312,[]
|
578 |
-
95,zephyr-7b-alpha,0.0688,mt-bench,BLZ_240312,[]
|
579 |
-
96,qwen-14b-chat,0.0696,mt-bench,BLZ_240312,[]
|
580 |
-
98,guanaco-33b,0.0653,mt-bench,BLZ_240312,[]
|
581 |
-
99,llama-2-7b-chat,0.06269999999999999,mt-bench,BLZ_240312,[]
|
582 |
-
101,mistral-7b-instruct-v0.1,0.0684,mt-bench,BLZ_240312,[]
|
583 |
-
102,palm-chat-bison-001,0.064,mt-bench,BLZ_240312,[]
|
584 |
-
103,vicuna-7b,0.0617,mt-bench,BLZ_240312,[]
|
585 |
-
104,koala-13b,0.0535,mt-bench,BLZ_240312,[]
|
586 |
-
106,gpt4all-13b-snoozy,0.0541,mt-bench,BLZ_240312,[]
|
587 |
-
107,mpt-7b-chat,0.0542,mt-bench,BLZ_240312,[]
|
588 |
-
108,chatglm2-6b,0.0496,mt-bench,BLZ_240312,[]
|
589 |
-
109,rwkv-4-raven-14b,0.0398,mt-bench,BLZ_240312,[]
|
590 |
-
110,alpaca-13b,0.0453,mt-bench,BLZ_240312,[]
|
591 |
-
111,openassistant-pythia-12b,0.0432,mt-bench,BLZ_240312,[]
|
592 |
-
112,chatglm-6b,0.045,mt-bench,BLZ_240312,[]
|
593 |
-
113,fastchat-t5-3b,0.0304,mt-bench,BLZ_240312,[]
|
594 |
-
114,stablelm-tuned-alpha-7b,0.0275,mt-bench,BLZ_240312,[]
|
595 |
-
115,dolly-v2-12b,0.032799999999999996,mt-bench,BLZ_240312,[]
|
596 |
-
116,llama-13b,0.026099999999999998,mt-bench,BLZ_240312,[]
|
597 |
-
0,gpt-4-0613,0.957,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
598 |
-
1,llama-3-70b,0.902,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
599 |
-
2,mixtral-8x22b,0.855,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
600 |
-
3,palmyra-x-v3-72b,0.826,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
601 |
-
4,gpt-4-turbo-1106-preview,0.821,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
602 |
-
5,palm-2-unicorn,0.781,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
603 |
-
6,claude-3-opus-20240229,0.762,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
604 |
-
7,qwen1.5-72b,0.757,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
605 |
-
8,palmyra-x-v2-33b,0.736,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
606 |
-
9,yi-34b,0.723,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
607 |
-
10,qwen1.5-32b,0.689,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
608 |
-
11,claude-v1.3,0.689,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
609 |
-
12,mixtral-8x7b-32k-seqlen,0.679,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
610 |
-
13,palm-2-bison,0.655,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
611 |
-
14,claude-2.0,0.651,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
612 |
-
15,deepseek-llm-67b-chat,0.645,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
613 |
-
16,llama-2-70b,0.609,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
614 |
-
17,claude-2.1,0.594,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
615 |
-
18,gpt-3.5-text-davinci-003,0.577,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
616 |
-
19,qwen1.5-14b,0.574,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
617 |
-
20,claude-instant-1.2,0.551,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
618 |
-
21,llama-3-8b,0.519,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
619 |
-
22,gpt-3.5-turbo-0613,0.502,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
620 |
-
23,gemma-7b,0.47,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
621 |
-
24,claude-3-sonnet-20240229,0.468,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
622 |
-
25,gpt-3.5-text-davinci-002,0.468,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
623 |
-
26,llama-65b,0.466,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
624 |
-
27,mistral-large-2402,0.46,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
625 |
-
28,cohere-command,0.421,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
626 |
-
29,dbrx-instructruct,0.419,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
627 |
-
30,mistral-v0.1-7b,0.415,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
628 |
-
31,mistral-small-2402,0.415,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
629 |
-
32,mistral-medium-2312,0.383,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
630 |
-
33,qwen1.5-7b,0.377,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
631 |
-
34,claude-3-haiku-20240307,0.377,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
632 |
-
35,yi-6b,0.351,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
633 |
-
36,llama-2-13b,0.332,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
634 |
-
37,jurassic-2-jumbo-178b,0.317,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
635 |
-
38,falcon-40b,0.306,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
636 |
-
39,phi-2,0.26,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
637 |
-
40,jurassic-2-grande-17b,0.253,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
638 |
-
41,llama-2-7b,0.234,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
639 |
-
42,luminous-supreme-70b,0.213,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
640 |
-
43,cohere-command-light,0.166,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
641 |
-
44,luminous-extended-30b,0.119,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
642 |
-
45,falcon-7b,0.1,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
643 |
-
46,olmo-7b,0.083,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
644 |
-
47,luminous-base-13b,0.072,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
645 |
-
0,llama-2-70b,0.944,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
646 |
-
1,llama-65b,0.908,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
647 |
-
2,text-davinci-002,0.905,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
648 |
-
3,mistral-v0.1-7b,0.884,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
649 |
-
4,cohere-command-beta-52.4b,0.874,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
650 |
-
5,text-davinci-003,0.872,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
651 |
-
6,jurassic-2-jumbo-178b,0.824,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
652 |
-
7,llama-2-13b,0.823,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
653 |
-
8,tnlg-v2-530b,0.787,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
654 |
-
9,gpt-3.5-turbo-0613,0.783,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
655 |
-
10,llama-30b,0.781,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
656 |
-
11,anthropic-lm-v4-s3-52b,0.78,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
657 |
-
12,gpt-3.5-turbo-0301,0.76,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
658 |
-
13,jurassic-2-grande-17b,0.743,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
659 |
-
14,palmyra-x-43b,0.732,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
660 |
-
15,falcon-40b,0.729,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
661 |
-
16,falcon-instruct-40b,0.727,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
662 |
-
17,mpt-instruct-30b,0.716,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
663 |
-
18,mpt-30b,0.714,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
664 |
-
19,j1-grande-v2-beta-17b,0.706,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
665 |
-
20,vicuna-v1.3-13b,0.706,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
666 |
-
21,cohere-command-beta-6.1b,0.675,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
667 |
-
22,cohere-xlarge-v20221108-52.4b,0.664,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
668 |
-
23,luminous-supreme-70b,0.662,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
669 |
-
24,vicuna-v1.3-7b,0.625,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
670 |
-
25,opt-175b,0.609,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
671 |
-
26,llama-2-7b,0.607,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
672 |
-
27,llama-13b,0.595,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
673 |
-
28,instructpalmyra-30b,0.568,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
674 |
-
29,cohere-xlarge-v20220609-52.4b,0.56,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
675 |
-
30,jurassic-2-large-7.5b,0.553,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
676 |
-
31,davinci-175b,0.538,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
677 |
-
32,llama-7b,0.533,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
678 |
-
33,redpajama-incite-instruct-7b,0.524,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
679 |
-
34,j1-jumbo-v1-178b,0.517,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
680 |
-
35,glm-130b,0.512,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
681 |
-
36,luminous-extended-30b,0.485,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
682 |
-
37,opt-66b,0.448,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
683 |
-
38,bloom-176b,0.446,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
684 |
-
39,j1-grande-v1-17b,0.433,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
685 |
-
40,alpaca-7b,0.381,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
686 |
-
41,falcon-7b,0.378,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
687 |
-
42,redpajama-incite-base-7b,0.378,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
688 |
-
43,cohere-large-v20220720-13.1b,0.372,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
689 |
-
44,redpajama-incite-instruct-v1-3b,0.366,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
690 |
-
45,text-curie-001,0.36,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
691 |
-
46,gpt-neox-20b,0.351,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
692 |
-
47,luminous-base-13b,0.315,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
693 |
-
48,cohere-medium-v20221108-6.1b,0.312,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
694 |
-
49,redpajama-incite-base-v1-3b,0.311,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
695 |
-
50,tnlg-v2-6.7b,0.309,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
696 |
-
51,j1-large-v1-7.5b,0.285,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
697 |
-
52,gpt-j-6b,0.273,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
698 |
-
53,pythia-12b,0.257,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
699 |
-
54,curie-6.7b,0.247,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
700 |
-
55,falcon-instruct-7b,0.244,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
701 |
-
56,cohere-medium-v20220720-6.1b,0.23,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
702 |
-
57,text-babbage-001,0.229,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
703 |
-
58,t0pp-11b,0.197,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
704 |
-
59,pythia-6.9b,0.196,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
705 |
-
60,ul2-20b,0.167,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
706 |
-
61,t5-11b,0.131,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
707 |
-
62,babbage-1.3b,0.114,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
708 |
-
63,cohere-small-v20220720-410m,0.109,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
709 |
-
64,ada-350m,0.108,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
710 |
-
65,text-ada-001,0.107,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
711 |
-
66,yalm-100b,0.075,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
712 |
-
67,llama-2-70b,0.582,mmlu,helm_classic_240130,[]
|
713 |
-
68,llama-65b,0.584,mmlu,helm_classic_240130,[]
|
714 |
-
69,text-davinci-002,0.568,mmlu,helm_classic_240130,[]
|
715 |
-
70,mistral-v0.1-7b,0.572,mmlu,helm_classic_240130,[]
|
716 |
-
71,cohere-command-beta-52.4b,0.452,mmlu,helm_classic_240130,[]
|
717 |
-
72,text-davinci-003,0.569,mmlu,helm_classic_240130,[]
|
718 |
-
73,jurassic-2-jumbo-178b,0.48,mmlu,helm_classic_240130,[]
|
719 |
-
74,llama-2-13b,0.507,mmlu,helm_classic_240130,[]
|
720 |
-
75,tnlg-v2-530b,0.469,mmlu,helm_classic_240130,[]
|
721 |
-
76,gpt-3.5-turbo-0613,0.391,mmlu,helm_classic_240130,[]
|
722 |
-
77,llama-30b,0.531,mmlu,helm_classic_240130,[]
|
723 |
-
78,anthropic-lm-v4-s3-52b,0.481,mmlu,helm_classic_240130,[]
|
724 |
-
79,gpt-3.5-turbo-0301,0.59,mmlu,helm_classic_240130,[]
|
725 |
-
80,jurassic-2-grande-17b,0.475,mmlu,helm_classic_240130,[]
|
726 |
-
81,palmyra-x-43b,0.609,mmlu,helm_classic_240130,[]
|
727 |
-
82,falcon-40b,0.509,mmlu,helm_classic_240130,[]
|
728 |
-
83,falcon-instruct-40b,0.497,mmlu,helm_classic_240130,[]
|
729 |
-
84,mpt-instruct-30b,0.444,mmlu,helm_classic_240130,[]
|
730 |
-
85,mpt-30b,0.437,mmlu,helm_classic_240130,[]
|
731 |
-
86,j1-grande-v2-beta-17b,0.445,mmlu,helm_classic_240130,[]
|
732 |
-
87,vicuna-v1.3-13b,0.462,mmlu,helm_classic_240130,[]
|
733 |
-
88,cohere-command-beta-6.1b,0.406,mmlu,helm_classic_240130,[]
|
734 |
-
89,cohere-xlarge-v20221108-52.4b,0.382,mmlu,helm_classic_240130,[]
|
735 |
-
90,luminous-supreme-70b,0.38,mmlu,helm_classic_240130,[]
|
736 |
-
91,vicuna-v1.3-7b,0.434,mmlu,helm_classic_240130,[]
|
737 |
-
92,opt-175b,0.318,mmlu,helm_classic_240130,[]
|
738 |
-
93,llama-2-7b,0.431,mmlu,helm_classic_240130,[]
|
739 |
-
94,llama-13b,0.422,mmlu,helm_classic_240130,[]
|
740 |
-
95,instructpalmyra-30b,0.403,mmlu,helm_classic_240130,[]
|
741 |
-
96,cohere-xlarge-v20220609-52.4b,0.353,mmlu,helm_classic_240130,[]
|
742 |
-
97,jurassic-2-large-7.5b,0.339,mmlu,helm_classic_240130,[]
|
743 |
-
98,davinci-175b,0.422,mmlu,helm_classic_240130,[]
|
744 |
-
99,llama-7b,0.321,mmlu,helm_classic_240130,[]
|
745 |
-
100,redpajama-incite-instruct-7b,0.363,mmlu,helm_classic_240130,[]
|
746 |
-
101,j1-jumbo-v1-178b,0.259,mmlu,helm_classic_240130,[]
|
747 |
-
102,glm-130b,0.344,mmlu,helm_classic_240130,[]
|
748 |
-
103,luminous-extended-30b,0.321,mmlu,helm_classic_240130,[]
|
749 |
-
104,opt-66b,0.276,mmlu,helm_classic_240130,[]
|
750 |
-
105,bloom-176b,0.299,mmlu,helm_classic_240130,[]
|
751 |
-
106,j1-grande-v1-17b,0.27,mmlu,helm_classic_240130,[]
|
752 |
-
107,alpaca-7b,0.385,mmlu,helm_classic_240130,[]
|
753 |
-
108,falcon-7b,0.286,mmlu,helm_classic_240130,[]
|
754 |
-
109,redpajama-incite-base-7b,0.302,mmlu,helm_classic_240130,[]
|
755 |
-
110,cohere-large-v20220720-13.1b,0.324,mmlu,helm_classic_240130,[]
|
756 |
-
111,redpajama-incite-instruct-v1-3b,0.257,mmlu,helm_classic_240130,[]
|
757 |
-
112,text-curie-001,0.237,mmlu,helm_classic_240130,[]
|
758 |
-
113,gpt-neox-20b,0.276,mmlu,helm_classic_240130,[]
|
759 |
-
114,luminous-base-13b,0.27,mmlu,helm_classic_240130,[]
|
760 |
-
115,cohere-medium-v20221108-6.1b,0.254,mmlu,helm_classic_240130,[]
|
761 |
-
116,redpajama-incite-base-v1-3b,0.263,mmlu,helm_classic_240130,[]
|
762 |
-
117,tnlg-v2-6.7b,0.242,mmlu,helm_classic_240130,[]
|
763 |
-
118,j1-large-v1-7.5b,0.241,mmlu,helm_classic_240130,[]
|
764 |
-
119,gpt-j-6b,0.249,mmlu,helm_classic_240130,[]
|
765 |
-
120,pythia-12b,0.274,mmlu,helm_classic_240130,[]
|
766 |
-
121,curie-6.7b,0.243,mmlu,helm_classic_240130,[]
|
767 |
-
122,falcon-instruct-7b,0.275,mmlu,helm_classic_240130,[]
|
768 |
-
123,cohere-medium-v20220720-6.1b,0.279,mmlu,helm_classic_240130,[]
|
769 |
-
124,text-babbage-001,0.229,mmlu,helm_classic_240130,[]
|
770 |
-
125,t0pp-11b,0.407,mmlu,helm_classic_240130,[]
|
771 |
-
126,pythia-6.9b,0.236,mmlu,helm_classic_240130,[]
|
772 |
-
127,ul2-20b,0.291,mmlu,helm_classic_240130,[]
|
773 |
-
128,t5-11b,0.29,mmlu,helm_classic_240130,[]
|
774 |
-
129,babbage-1.3b,0.235,mmlu,helm_classic_240130,[]
|
775 |
-
130,cohere-small-v20220720-410m,0.264,mmlu,helm_classic_240130,[]
|
776 |
-
131,ada-350m,0.243,mmlu,helm_classic_240130,[]
|
777 |
-
132,text-ada-001,0.238,mmlu,helm_classic_240130,[]
|
778 |
-
133,yalm-100b,0.243,mmlu,helm_classic_240130,[]
|
779 |
-
0,gpt-4o-0513,35.7,wildbench-mix,wildbench_240612,[]
|
780 |
-
1,gpt-4-turbo-0409,34.6,wildbench-mix,wildbench_240612,[]
|
781 |
-
2,gpt-4-turbo-0125,29.9,wildbench-mix,wildbench_240612,[]
|
782 |
-
3,gemini-1.5-pro,27.8,wildbench-mix,wildbench_240612,[]
|
783 |
-
4,llama-3-70b-inst,21.0,wildbench-mix,wildbench_240612,[]
|
784 |
-
5,claude-3-opus,20.1,wildbench-mix,wildbench_240612,[]
|
785 |
-
6,gemini-1.5-flash,17.4,wildbench-mix,wildbench_240612,[]
|
786 |
-
7,yi-1.5-34b-chat,16.8,wildbench-mix,wildbench_240612,[]
|
787 |
-
8,llama3-inst-8b-simpo,14.0,wildbench-mix,wildbench_240612,[]
|
788 |
-
9,claude-3-sonnet,7.2,wildbench-mix,wildbench_240612,[]
|
789 |
-
10,qwen1.5-72b-chat,4.4,wildbench-mix,wildbench_240612,[]
|
790 |
-
11,command-r-plus,0.4,wildbench-mix,wildbench_240612,[]
|
791 |
-
12,claude-3-haiku,-8.5,wildbench-mix,wildbench_240612,[]
|
792 |
-
13,mistral-large,-10.5,wildbench-mix,wildbench_240612,[]
|
793 |
-
14,starlinglm-7b-beta,-11.9,wildbench-mix,wildbench_240612,[]
|
794 |
-
15,llama-3-8b-inst,-14.6,wildbench-mix,wildbench_240612,[]
|
795 |
-
16,command-r,-16.0,wildbench-mix,wildbench_240612,[]
|
796 |
-
17,mixtral-8x7b-inst,-18.8,wildbench-mix,wildbench_240612,[]
|
797 |
-
18,dbrx-instruct,-21.6,wildbench-mix,wildbench_240612,[]
|
798 |
-
19,yi-1.5-6b-chat,-24.3,wildbench-mix,wildbench_240612,[]
|
799 |
-
20,mistral-7b-inst-v0.2,-25.0,wildbench-mix,wildbench_240612,[]
|
800 |
-
21,tulu-2-dpo-70b,-25.4,wildbench-mix,wildbench_240612,[]
|
801 |
-
22,llama-2-70b-chat,-26.8,wildbench-mix,wildbench_240612,[]
|
802 |
-
23,qwen1.5-7b-chat,-27.0,wildbench-mix,wildbench_240612,[]
|
803 |
-
24,phi-3-medium-128k,-33.3,wildbench-mix,wildbench_240612,[]
|
804 |
-
25,gpt-3.5-turbo-0125,-33.5,wildbench-mix,wildbench_240612,[]
|
805 |
-
26,llama-2-7b-chat,-48.0,wildbench-mix,wildbench_240612,[]
|
806 |
-
27,gemma-7b-it,-57.0,wildbench-mix,wildbench_240612,[]
|
807 |
-
28,gemma-2b-it,-74.1,wildbench-mix,wildbench_240612,[]
|
808 |
-
13,flan-t5-xxl,0.2244897959183673,mmlu_pro,bluebench_v02,[]
|
809 |
-
30,granite-13b-chat-v2,0.2857142857142857,mmlu_pro,bluebench_v02,[]
|
810 |
-
41,granite-13b-instruct-v2,0.0408163265306122,mmlu_pro,bluebench_v02,[]
|
811 |
-
50,granite-7b-lab,0.2423469387755102,mmlu_pro,bluebench_v02,[]
|
812 |
-
60,llama-2-13b-chat,0.0943877551020408,mmlu_pro,bluebench_v02,[]
|
813 |
-
70,llama-2-70b,0.4081632653061224,mmlu_pro,bluebench_v02,[]
|
814 |
-
81,llama-3-70b-instruct,0.4285714285714285,mmlu_pro,bluebench_v02,[]
|
815 |
-
92,llama-3-8b,0.375,mmlu_pro,bluebench_v02,[]
|
816 |
-
103,llama-3-8b-instruct,0.0994897959183673,mmlu_pro,bluebench_v02,[]
|
817 |
-
112,llama-30b,0.3061224489795918,mmlu_pro,bluebench_v02,[]
|
818 |
-
121,llama-7b,0.1326530612244897,mmlu_pro,bluebench_v02,[]
|
819 |
-
132,mistral-v0.1-7b,0.2857142857142857,mmlu_pro,bluebench_v02,[]
|
820 |
-
143,mixtral-8x7b-instruct-v01,0.375,mmlu_pro,bluebench_v02,[]
|
821 |
-
153,vicuna-13b-v1.5-16k,0.2857142857142857,mmlu_pro,bluebench_v02,[]
|
822 |
-
162,vicuna-33b-v1.3,0.2653061224489796,mmlu_pro,bluebench_v02,[]
|
823 |
-
172,vicuna-v1.3-7b,0.1938775510204081,mmlu_pro,bluebench_v02,[]
|
824 |
-
182,vicuna-7b-v1.5,0.2857142857142857,mmlu_pro,bluebench_v02,[]
|
825 |
-
192,zephyr-7b-beta,0.2959183673469387,mmlu_pro,bluebench_v02,[]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
assets/livebench.csv
DELETED
@@ -1,365 +0,0 @@
|
|
1 |
-
,model,scenario,score,aggragated_from,source
|
2 |
-
0,claude_3_5_sonnet_20240620,livebench_lb,61.16,[],livebench_240701
|
3 |
-
1,gpt_4o_2024_05_13,livebench_lb,54.96,[],livebench_240701
|
4 |
-
2,gpt_4_turbo_2024_04_09,livebench_lb,53.0,[],livebench_240701
|
5 |
-
3,gpt_4_1106_preview,livebench_lb,52.17,[],livebench_240701
|
6 |
-
4,claude_3_opus_20240229,livebench_lb,50.75,[],livebench_240701
|
7 |
-
5,gpt_4_0125_preview,livebench_lb,49.39,[],livebench_240701
|
8 |
-
6,deepseek_coder_v2,livebench_lb,46.79,[],livebench_240701
|
9 |
-
7,gemini_1.5_pro_api_0514,livebench_lb,44.35,[],livebench_240701
|
10 |
-
8,gemma_2_27b_it,livebench_lb,41.22,[],livebench_240701
|
11 |
-
9,gemini_1.5_flash_api_0514,livebench_lb,40.89,[],livebench_240701
|
12 |
-
10,qwen2_72b_instruct,livebench_lb,40.16,[],livebench_240701
|
13 |
-
11,acm_rewrite_qwen2_72b_chat,livebench_lb,39.6,[],livebench_240701
|
14 |
-
12,mistral_large_2402,livebench_lb,38.92,[],livebench_240701
|
15 |
-
13,deepseek_chat_v2,livebench_lb,38.39,[],livebench_240701
|
16 |
-
14,claude_3_sonnet_20240229,livebench_lb,38.08,[],livebench_240701
|
17 |
-
15,meta_llama_3_70b_instruct,livebench_lb,37.38,[],livebench_240701
|
18 |
-
16,claude_3_haiku_20240307,livebench_lb,35.32,[],livebench_240701
|
19 |
-
17,mixtral_8x22b_instruct_v0.1,livebench_lb,34.84,[],livebench_240701
|
20 |
-
18,gpt_3.5_turbo_0125,livebench_lb,34.43,[],livebench_240701
|
21 |
-
19,gpt_3.5_turbo_1106,livebench_lb,34.14,[],livebench_240701
|
22 |
-
20,command_r_plus,livebench_lb,32.86,[],livebench_240701
|
23 |
-
21,mistral_small_2402,livebench_lb,32.8,[],livebench_240701
|
24 |
-
22,gemma_2_9b_it,livebench_lb,31.57,[],livebench_240701
|
25 |
-
23,phi_3_medium_4k_instruct,livebench_lb,30.33,[],livebench_240701
|
26 |
-
24,phi_3_medium_128k_instruct,livebench_lb,29.64,[],livebench_240701
|
27 |
-
25,deepseek_coder_v2_lite_instruct,livebench_lb,29.15,[],livebench_240701
|
28 |
-
26,qwen1.5_110b_chat,livebench_lb,28.96,[],livebench_240701
|
29 |
-
27,qwen1.5_72b_chat,livebench_lb,28.89,[],livebench_240701
|
30 |
-
28,command_r,livebench_lb,27.23,[],livebench_240701
|
31 |
-
29,phi_3_small_128k_instruct,livebench_lb,27.19,[],livebench_240701
|
32 |
-
30,meta_llama_3_8b_instruct,livebench_lb,26.67,[],livebench_240701
|
33 |
-
31,qwen2_7b_instruct,livebench_lb,26.45,[],livebench_240701
|
34 |
-
32,phi_3_small_8k_instruct,livebench_lb,26.24,[],livebench_240701
|
35 |
-
33,openhermes_2.5_mistral_7b,livebench_lb,23.3,[],livebench_240701
|
36 |
-
34,mixtral_8x7b_instruct_v0.1,livebench_lb,22.5,[],livebench_240701
|
37 |
-
35,mistral_7b_instruct_v0.2,livebench_lb,19.33,[],livebench_240701
|
38 |
-
36,phi_3_mini_4k_instruct,livebench_lb,19.27,[],livebench_240701
|
39 |
-
37,zephyr_7b_alpha,livebench_lb,19.22,[],livebench_240701
|
40 |
-
38,phi_3_mini_128k_instruct,livebench_lb,18.04,[],livebench_240701
|
41 |
-
39,zephyr_7b_beta,livebench_lb,17.32,[],livebench_240701
|
42 |
-
40,deepseek_v2_lite_chat,livebench_lb,17.14,[],livebench_240701
|
43 |
-
41,qwen1.5_7b_chat,livebench_lb,16.5,[],livebench_240701
|
44 |
-
42,starling_lm_7b_beta,livebench_lb,16.44,[],livebench_240701
|
45 |
-
43,vicuna_7b_v1.5_16k,livebench_lb,13.71,[],livebench_240701
|
46 |
-
44,vicuna_7b_v1.5,livebench_lb,11.73,[],livebench_240701
|
47 |
-
45,qwen1.5_4b_chat,livebench_lb,11.13,[],livebench_240701
|
48 |
-
46,llama_2_7b_chat,livebench_lb,10.25,[],livebench_240701
|
49 |
-
47,qwen2_1.5b_instruct,livebench_lb,9.96,[],livebench_240701
|
50 |
-
48,yi_6b_chat,livebench_lb,8.79,[],livebench_240701
|
51 |
-
49,qwen2_0.5b_instruct,livebench_lb,6.78,[],livebench_240701
|
52 |
-
50,qwen1.5_1.8b_chat,livebench_lb,6.09,[],livebench_240701
|
53 |
-
51,qwen1.5_0.5b_chat,livebench_lb,5.26,[],livebench_240701
|
54 |
-
52,claude_3_5_sonnet_20240620,reasoning_lb,64.0,[],livebench_240701
|
55 |
-
53,gpt_4o_2024_05_13,reasoning_lb,55.0,[],livebench_240701
|
56 |
-
54,gpt_4_turbo_2024_04_09,reasoning_lb,54.0,[],livebench_240701
|
57 |
-
55,gpt_4_1106_preview,reasoning_lb,52.0,[],livebench_240701
|
58 |
-
56,claude_3_opus_20240229,reasoning_lb,41.0,[],livebench_240701
|
59 |
-
57,gpt_4_0125_preview,reasoning_lb,48.0,[],livebench_240701
|
60 |
-
58,deepseek_coder_v2,reasoning_lb,49.0,[],livebench_240701
|
61 |
-
59,gemini_1.5_pro_api_0514,reasoning_lb,33.0,[],livebench_240701
|
62 |
-
60,gemma_2_27b_it,reasoning_lb,31.0,[],livebench_240701
|
63 |
-
61,gemini_1.5_flash_api_0514,reasoning_lb,30.0,[],livebench_240701
|
64 |
-
62,qwen2_72b_instruct,reasoning_lb,42.0,[],livebench_240701
|
65 |
-
63,acm_rewrite_qwen2_72b_chat,reasoning_lb,37.0,[],livebench_240701
|
66 |
-
64,mistral_large_2402,reasoning_lb,35.0,[],livebench_240701
|
67 |
-
65,deepseek_chat_v2,reasoning_lb,29.0,[],livebench_240701
|
68 |
-
66,claude_3_sonnet_20240229,reasoning_lb,26.0,[],livebench_240701
|
69 |
-
67,meta_llama_3_70b_instruct,reasoning_lb,31.0,[],livebench_240701
|
70 |
-
68,claude_3_haiku_20240307,reasoning_lb,26.0,[],livebench_240701
|
71 |
-
69,mixtral_8x22b_instruct_v0.1,reasoning_lb,29.0,[],livebench_240701
|
72 |
-
70,gpt_3.5_turbo_0125,reasoning_lb,26.0,[],livebench_240701
|
73 |
-
71,gpt_3.5_turbo_1106,reasoning_lb,28.0,[],livebench_240701
|
74 |
-
72,command_r_plus,reasoning_lb,32.0,[],livebench_240701
|
75 |
-
73,mistral_small_2402,reasoning_lb,28.0,[],livebench_240701
|
76 |
-
74,gemma_2_9b_it,reasoning_lb,19.0,[],livebench_240701
|
77 |
-
75,phi_3_medium_4k_instruct,reasoning_lb,35.0,[],livebench_240701
|
78 |
-
76,phi_3_medium_128k_instruct,reasoning_lb,31.0,[],livebench_240701
|
79 |
-
77,deepseek_coder_v2_lite_instruct,reasoning_lb,22.0,[],livebench_240701
|
80 |
-
78,qwen1.5_110b_chat,reasoning_lb,26.0,[],livebench_240701
|
81 |
-
79,qwen1.5_72b_chat,reasoning_lb,21.0,[],livebench_240701
|
82 |
-
80,command_r,reasoning_lb,28.0,[],livebench_240701
|
83 |
-
81,phi_3_small_128k_instruct,reasoning_lb,36.0,[],livebench_240701
|
84 |
-
82,meta_llama_3_8b_instruct,reasoning_lb,25.0,[],livebench_240701
|
85 |
-
83,qwen2_7b_instruct,reasoning_lb,20.0,[],livebench_240701
|
86 |
-
84,phi_3_small_8k_instruct,reasoning_lb,23.0,[],livebench_240701
|
87 |
-
85,openhermes_2.5_mistral_7b,reasoning_lb,17.0,[],livebench_240701
|
88 |
-
86,mixtral_8x7b_instruct_v0.1,reasoning_lb,18.0,[],livebench_240701
|
89 |
-
87,mistral_7b_instruct_v0.2,reasoning_lb,13.0,[],livebench_240701
|
90 |
-
88,phi_3_mini_4k_instruct,reasoning_lb,19.0,[],livebench_240701
|
91 |
-
89,zephyr_7b_alpha,reasoning_lb,17.0,[],livebench_240701
|
92 |
-
90,phi_3_mini_128k_instruct,reasoning_lb,10.0,[],livebench_240701
|
93 |
-
91,zephyr_7b_beta,reasoning_lb,16.0,[],livebench_240701
|
94 |
-
92,deepseek_v2_lite_chat,reasoning_lb,13.0,[],livebench_240701
|
95 |
-
93,qwen1.5_7b_chat,reasoning_lb,13.0,[],livebench_240701
|
96 |
-
94,starling_lm_7b_beta,reasoning_lb,19.0,[],livebench_240701
|
97 |
-
95,vicuna_7b_v1.5_16k,reasoning_lb,15.0,[],livebench_240701
|
98 |
-
96,vicuna_7b_v1.5,reasoning_lb,12.0,[],livebench_240701
|
99 |
-
97,qwen1.5_4b_chat,reasoning_lb,13.0,[],livebench_240701
|
100 |
-
98,llama_2_7b_chat,reasoning_lb,5.0,[],livebench_240701
|
101 |
-
99,qwen2_1.5b_instruct,reasoning_lb,8.0,[],livebench_240701
|
102 |
-
100,yi_6b_chat,reasoning_lb,8.0,[],livebench_240701
|
103 |
-
101,qwen2_0.5b_instruct,reasoning_lb,3.0,[],livebench_240701
|
104 |
-
102,qwen1.5_1.8b_chat,reasoning_lb,5.0,[],livebench_240701
|
105 |
-
103,qwen1.5_0.5b_chat,reasoning_lb,4.0,[],livebench_240701
|
106 |
-
104,claude_3_5_sonnet_20240620,coding_lb,63.21,[],livebench_240701
|
107 |
-
105,gpt_4o_2024_05_13,coding_lb,46.37,[],livebench_240701
|
108 |
-
106,gpt_4_turbo_2024_04_09,coding_lb,47.05,[],livebench_240701
|
109 |
-
107,gpt_4_1106_preview,coding_lb,44.37,[],livebench_240701
|
110 |
-
108,claude_3_opus_20240229,coding_lb,40.05,[],livebench_240701
|
111 |
-
109,gpt_4_0125_preview,coding_lb,44.05,[],livebench_240701
|
112 |
-
110,deepseek_coder_v2,coding_lb,41.05,[],livebench_240701
|
113 |
-
111,gemini_1.5_pro_api_0514,coding_lb,32.79,[],livebench_240701
|
114 |
-
112,gemma_2_27b_it,coding_lb,36.74,[],livebench_240701
|
115 |
-
113,gemini_1.5_flash_api_0514,coding_lb,39.05,[],livebench_240701
|
116 |
-
114,qwen2_72b_instruct,coding_lb,31.79,[],livebench_240701
|
117 |
-
115,acm_rewrite_qwen2_72b_chat,coding_lb,39.05,[],livebench_240701
|
118 |
-
116,mistral_large_2402,coding_lb,26.84,[],livebench_240701
|
119 |
-
117,deepseek_chat_v2,coding_lb,33.47,[],livebench_240701
|
120 |
-
118,claude_3_sonnet_20240229,coding_lb,25.21,[],livebench_240701
|
121 |
-
119,meta_llama_3_70b_instruct,coding_lb,20.95,[],livebench_240701
|
122 |
-
120,claude_3_haiku_20240307,coding_lb,24.53,[],livebench_240701
|
123 |
-
121,mixtral_8x22b_instruct_v0.1,coding_lb,33.11,[],livebench_240701
|
124 |
-
122,gpt_3.5_turbo_0125,coding_lb,29.16,[],livebench_240701
|
125 |
-
123,gpt_3.5_turbo_1106,coding_lb,26.84,[],livebench_240701
|
126 |
-
124,command_r_plus,coding_lb,20.26,[],livebench_240701
|
127 |
-
125,mistral_small_2402,coding_lb,24.21,[],livebench_240701
|
128 |
-
126,gemma_2_9b_it,coding_lb,22.21,[],livebench_240701
|
129 |
-
127,phi_3_medium_4k_instruct,coding_lb,20.58,[],livebench_240701
|
130 |
-
128,phi_3_medium_128k_instruct,coding_lb,21.58,[],livebench_240701
|
131 |
-
129,deepseek_coder_v2_lite_instruct,coding_lb,26.84,[],livebench_240701
|
132 |
-
130,qwen1.5_110b_chat,coding_lb,22.21,[],livebench_240701
|
133 |
-
131,qwen1.5_72b_chat,coding_lb,22.89,[],livebench_240701
|
134 |
-
132,command_r,coding_lb,14.95,[],livebench_240701
|
135 |
-
133,phi_3_small_128k_instruct,coding_lb,25.84,[],livebench_240701
|
136 |
-
134,meta_llama_3_8b_instruct,coding_lb,18.26,[],livebench_240701
|
137 |
-
135,qwen2_7b_instruct,coding_lb,29.21,[],livebench_240701
|
138 |
-
136,phi_3_small_8k_instruct,coding_lb,19.58,[],livebench_240701
|
139 |
-
137,openhermes_2.5_mistral_7b,coding_lb,11.63,[],livebench_240701
|
140 |
-
138,mixtral_8x7b_instruct_v0.1,coding_lb,11.32,[],livebench_240701
|
141 |
-
139,mistral_7b_instruct_v0.2,coding_lb,11.63,[],livebench_240701
|
142 |
-
140,phi_3_mini_4k_instruct,coding_lb,14.95,[],livebench_240701
|
143 |
-
141,zephyr_7b_alpha,coding_lb,11.32,[],livebench_240701
|
144 |
-
142,phi_3_mini_128k_instruct,coding_lb,11.63,[],livebench_240701
|
145 |
-
143,zephyr_7b_beta,coding_lb,8.32,[],livebench_240701
|
146 |
-
144,deepseek_v2_lite_chat,coding_lb,8.63,[],livebench_240701
|
147 |
-
145,qwen1.5_7b_chat,coding_lb,6.63,[],livebench_240701
|
148 |
-
146,starling_lm_7b_beta,coding_lb,18.26,[],livebench_240701
|
149 |
-
147,vicuna_7b_v1.5_16k,coding_lb,1.32,[],livebench_240701
|
150 |
-
148,vicuna_7b_v1.5,coding_lb,1.0,[],livebench_240701
|
151 |
-
149,qwen1.5_4b_chat,coding_lb,4.0,[],livebench_240701
|
152 |
-
150,llama_2_7b_chat,coding_lb,0.0,[],livebench_240701
|
153 |
-
151,qwen2_1.5b_instruct,coding_lb,5.63,[],livebench_240701
|
154 |
-
152,yi_6b_chat,coding_lb,1.32,[],livebench_240701
|
155 |
-
153,qwen2_0.5b_instruct,coding_lb,2.0,[],livebench_240701
|
156 |
-
154,qwen1.5_1.8b_chat,coding_lb,0.0,[],livebench_240701
|
157 |
-
155,qwen1.5_0.5b_chat,coding_lb,0.0,[],livebench_240701
|
158 |
-
156,claude_3_5_sonnet_20240620,mathematics_lb,53.75,[],livebench_240701
|
159 |
-
157,gpt_4o_2024_05_13,mathematics_lb,49.88,[],livebench_240701
|
160 |
-
158,gpt_4_turbo_2024_04_09,mathematics_lb,48.99,[],livebench_240701
|
161 |
-
159,gpt_4_1106_preview,mathematics_lb,47.55,[],livebench_240701
|
162 |
-
160,claude_3_opus_20240229,mathematics_lb,46.54,[],livebench_240701
|
163 |
-
161,gpt_4_0125_preview,mathematics_lb,42.75,[],livebench_240701
|
164 |
-
162,deepseek_coder_v2,mathematics_lb,52.19,[],livebench_240701
|
165 |
-
163,gemini_1.5_pro_api_0514,mathematics_lb,42.07,[],livebench_240701
|
166 |
-
164,gemma_2_27b_it,mathematics_lb,36.23,[],livebench_240701
|
167 |
-
165,gemini_1.5_flash_api_0514,mathematics_lb,38.54,[],livebench_240701
|
168 |
-
166,qwen2_72b_instruct,mathematics_lb,43.44,[],livebench_240701
|
169 |
-
167,acm_rewrite_qwen2_72b_chat,mathematics_lb,40.32,[],livebench_240701
|
170 |
-
168,mistral_large_2402,mathematics_lb,32.2,[],livebench_240701
|
171 |
-
169,deepseek_chat_v2,mathematics_lb,33.23,[],livebench_240701
|
172 |
-
170,claude_3_sonnet_20240229,mathematics_lb,29.65,[],livebench_240701
|
173 |
-
171,meta_llama_3_70b_instruct,mathematics_lb,32.31,[],livebench_240701
|
174 |
-
172,claude_3_haiku_20240307,mathematics_lb,25.72,[],livebench_240701
|
175 |
-
173,mixtral_8x22b_instruct_v0.1,mathematics_lb,26.94,[],livebench_240701
|
176 |
-
174,gpt_3.5_turbo_0125,mathematics_lb,25.54,[],livebench_240701
|
177 |
-
175,gpt_3.5_turbo_1106,mathematics_lb,28.13,[],livebench_240701
|
178 |
-
176,command_r_plus,mathematics_lb,24.85,[],livebench_240701
|
179 |
-
177,mistral_small_2402,mathematics_lb,26.76,[],livebench_240701
|
180 |
-
178,gemma_2_9b_it,mathematics_lb,23.98,[],livebench_240701
|
181 |
-
179,phi_3_medium_4k_instruct,mathematics_lb,27.54,[],livebench_240701
|
182 |
-
180,phi_3_medium_128k_instruct,mathematics_lb,24.25,[],livebench_240701
|
183 |
-
181,deepseek_coder_v2_lite_instruct,mathematics_lb,34.09,[],livebench_240701
|
184 |
-
182,qwen1.5_110b_chat,mathematics_lb,25.58,[],livebench_240701
|
185 |
-
183,qwen1.5_72b_chat,mathematics_lb,26.82,[],livebench_240701
|
186 |
-
184,command_r,mathematics_lb,16.92,[],livebench_240701
|
187 |
-
185,phi_3_small_128k_instruct,mathematics_lb,24.84,[],livebench_240701
|
188 |
-
186,meta_llama_3_8b_instruct,mathematics_lb,17.58,[],livebench_240701
|
189 |
-
187,qwen2_7b_instruct,mathematics_lb,25.83,[],livebench_240701
|
190 |
-
188,phi_3_small_8k_instruct,mathematics_lb,24.15,[],livebench_240701
|
191 |
-
189,openhermes_2.5_mistral_7b,mathematics_lb,20.1,[],livebench_240701
|
192 |
-
190,mixtral_8x7b_instruct_v0.1,mathematics_lb,18.97,[],livebench_240701
|
193 |
-
191,mistral_7b_instruct_v0.2,mathematics_lb,16.04,[],livebench_240701
|
194 |
-
192,phi_3_mini_4k_instruct,mathematics_lb,19.88,[],livebench_240701
|
195 |
-
193,zephyr_7b_alpha,mathematics_lb,9.61,[],livebench_240701
|
196 |
-
194,phi_3_mini_128k_instruct,mathematics_lb,21.48,[],livebench_240701
|
197 |
-
195,zephyr_7b_beta,mathematics_lb,11.23,[],livebench_240701
|
198 |
-
196,deepseek_v2_lite_chat,mathematics_lb,11.99,[],livebench_240701
|
199 |
-
197,qwen1.5_7b_chat,mathematics_lb,12.86,[],livebench_240701
|
200 |
-
198,starling_lm_7b_beta,mathematics_lb,13.82,[],livebench_240701
|
201 |
-
199,vicuna_7b_v1.5_16k,mathematics_lb,6.61,[],livebench_240701
|
202 |
-
200,vicuna_7b_v1.5,mathematics_lb,4.33,[],livebench_240701
|
203 |
-
201,qwen1.5_4b_chat,mathematics_lb,7.08,[],livebench_240701
|
204 |
-
202,llama_2_7b_chat,mathematics_lb,4.78,[],livebench_240701
|
205 |
-
203,qwen2_1.5b_instruct,mathematics_lb,7.16,[],livebench_240701
|
206 |
-
204,yi_6b_chat,mathematics_lb,7.14,[],livebench_240701
|
207 |
-
205,qwen2_0.5b_instruct,mathematics_lb,4.22,[],livebench_240701
|
208 |
-
206,qwen1.5_1.8b_chat,mathematics_lb,2.14,[],livebench_240701
|
209 |
-
207,qwen1.5_0.5b_chat,mathematics_lb,3.39,[],livebench_240701
|
210 |
-
208,claude_3_5_sonnet_20240620,data_analysis_lb,56.74,[],livebench_240701
|
211 |
-
209,gpt_4o_2024_05_13,data_analysis_lb,52.41,[],livebench_240701
|
212 |
-
210,gpt_4_turbo_2024_04_09,data_analysis_lb,51.32,[],livebench_240701
|
213 |
-
211,gpt_4_1106_preview,data_analysis_lb,51.33,[],livebench_240701
|
214 |
-
212,claude_3_opus_20240229,data_analysis_lb,54.32,[],livebench_240701
|
215 |
-
213,gpt_4_0125_preview,data_analysis_lb,54.06,[],livebench_240701
|
216 |
-
214,deepseek_coder_v2,data_analysis_lb,38.25,[],livebench_240701
|
217 |
-
215,gemini_1.5_pro_api_0514,data_analysis_lb,52.81,[],livebench_240701
|
218 |
-
216,gemma_2_27b_it,data_analysis_lb,43.58,[],livebench_240701
|
219 |
-
217,gemini_1.5_flash_api_0514,data_analysis_lb,44.03,[],livebench_240701
|
220 |
-
218,qwen2_72b_instruct,data_analysis_lb,26.24,[],livebench_240701
|
221 |
-
219,acm_rewrite_qwen2_72b_chat,data_analysis_lb,26.19,[],livebench_240701
|
222 |
-
220,mistral_large_2402,data_analysis_lb,42.55,[],livebench_240701
|
223 |
-
221,deepseek_chat_v2,data_analysis_lb,38.03,[],livebench_240701
|
224 |
-
222,claude_3_sonnet_20240229,data_analysis_lb,44.56,[],livebench_240701
|
225 |
-
223,meta_llama_3_70b_instruct,data_analysis_lb,42.41,[],livebench_240701
|
226 |
-
224,claude_3_haiku_20240307,data_analysis_lb,41.54,[],livebench_240701
|
227 |
-
225,mixtral_8x22b_instruct_v0.1,data_analysis_lb,30.33,[],livebench_240701
|
228 |
-
226,gpt_3.5_turbo_0125,data_analysis_lb,41.21,[],livebench_240701
|
229 |
-
227,gpt_3.5_turbo_1106,data_analysis_lb,41.7,[],livebench_240701
|
230 |
-
228,command_r_plus,data_analysis_lb,24.6,[],livebench_240701
|
231 |
-
229,mistral_small_2402,data_analysis_lb,31.88,[],livebench_240701
|
232 |
-
230,gemma_2_9b_it,data_analysis_lb,35.06,[],livebench_240701
|
233 |
-
231,phi_3_medium_4k_instruct,data_analysis_lb,31.63,[],livebench_240701
|
234 |
-
232,phi_3_medium_128k_instruct,data_analysis_lb,32.12,[],livebench_240701
|
235 |
-
233,deepseek_coder_v2_lite_instruct,data_analysis_lb,33.0,[],livebench_240701
|
236 |
-
234,qwen1.5_110b_chat,data_analysis_lb,31.45,[],livebench_240701
|
237 |
-
235,qwen1.5_72b_chat,data_analysis_lb,32.98,[],livebench_240701
|
238 |
-
236,command_r,data_analysis_lb,31.69,[],livebench_240701
|
239 |
-
237,phi_3_small_128k_instruct,data_analysis_lb,27.33,[],livebench_240701
|
240 |
-
238,meta_llama_3_8b_instruct,data_analysis_lb,23.33,[],livebench_240701
|
241 |
-
239,qwen2_7b_instruct,data_analysis_lb,28.75,[],livebench_240701
|
242 |
-
240,phi_3_small_8k_instruct,data_analysis_lb,27.5,[],livebench_240701
|
243 |
-
241,openhermes_2.5_mistral_7b,data_analysis_lb,26.92,[],livebench_240701
|
244 |
-
242,mixtral_8x7b_instruct_v0.1,data_analysis_lb,28.13,[],livebench_240701
|
245 |
-
243,mistral_7b_instruct_v0.2,data_analysis_lb,14.62,[],livebench_240701
|
246 |
-
244,phi_3_mini_4k_instruct,data_analysis_lb,14.67,[],livebench_240701
|
247 |
-
245,zephyr_7b_alpha,data_analysis_lb,17.4,[],livebench_240701
|
248 |
-
246,phi_3_mini_128k_instruct,data_analysis_lb,8.69,[],livebench_240701
|
249 |
-
247,zephyr_7b_beta,data_analysis_lb,15.75,[],livebench_240701
|
250 |
-
248,deepseek_v2_lite_chat,data_analysis_lb,18.19,[],livebench_240701
|
251 |
-
249,qwen1.5_7b_chat,data_analysis_lb,16.23,[],livebench_240701
|
252 |
-
250,starling_lm_7b_beta,data_analysis_lb,2.0,[],livebench_240701
|
253 |
-
251,vicuna_7b_v1.5_16k,data_analysis_lb,9.27,[],livebench_240701
|
254 |
-
252,vicuna_7b_v1.5,data_analysis_lb,2.67,[],livebench_240701
|
255 |
-
253,qwen1.5_4b_chat,data_analysis_lb,9.13,[],livebench_240701
|
256 |
-
254,llama_2_7b_chat,data_analysis_lb,0.0,[],livebench_240701
|
257 |
-
255,qwen2_1.5b_instruct,data_analysis_lb,10.01,[],livebench_240701
|
258 |
-
256,yi_6b_chat,data_analysis_lb,4.38,[],livebench_240701
|
259 |
-
257,qwen2_0.5b_instruct,data_analysis_lb,2.0,[],livebench_240701
|
260 |
-
258,qwen1.5_1.8b_chat,data_analysis_lb,3.33,[],livebench_240701
|
261 |
-
259,qwen1.5_0.5b_chat,data_analysis_lb,0.0,[],livebench_240701
|
262 |
-
260,claude_3_5_sonnet_20240620,language_lb,56.94,[],livebench_240701
|
263 |
-
261,gpt_4o_2024_05_13,language_lb,53.94,[],livebench_240701
|
264 |
-
262,gpt_4_turbo_2024_04_09,language_lb,45.26,[],livebench_240701
|
265 |
-
263,gpt_4_1106_preview,language_lb,48.37,[],livebench_240701
|
266 |
-
264,claude_3_opus_20240229,language_lb,51.72,[],livebench_240701
|
267 |
-
265,gpt_4_0125_preview,language_lb,43.55,[],livebench_240701
|
268 |
-
266,deepseek_coder_v2,language_lb,33.04,[],livebench_240701
|
269 |
-
267,gemini_1.5_pro_api_0514,language_lb,38.25,[],livebench_240701
|
270 |
-
268,gemma_2_27b_it,language_lb,32.4,[],livebench_240701
|
271 |
-
269,gemini_1.5_flash_api_0514,language_lb,30.69,[],livebench_240701
|
272 |
-
270,qwen2_72b_instruct,language_lb,29.21,[],livebench_240701
|
273 |
-
271,acm_rewrite_qwen2_72b_chat,language_lb,30.03,[],livebench_240701
|
274 |
-
272,mistral_large_2402,language_lb,28.74,[],livebench_240701
|
275 |
-
273,deepseek_chat_v2,language_lb,32.29,[],livebench_240701
|
276 |
-
274,claude_3_sonnet_20240229,language_lb,38.08,[],livebench_240701
|
277 |
-
275,meta_llama_3_70b_instruct,language_lb,34.11,[],livebench_240701
|
278 |
-
276,claude_3_haiku_20240307,language_lb,30.07,[],livebench_240701
|
279 |
-
277,mixtral_8x22b_instruct_v0.1,language_lb,26.48,[],livebench_240701
|
280 |
-
278,gpt_3.5_turbo_0125,language_lb,24.22,[],livebench_240701
|
281 |
-
279,gpt_3.5_turbo_1106,language_lb,28.63,[],livebench_240701
|
282 |
-
280,command_r_plus,language_lb,23.92,[],livebench_240701
|
283 |
-
281,mistral_small_2402,language_lb,22.06,[],livebench_240701
|
284 |
-
282,gemma_2_9b_it,language_lb,27.64,[],livebench_240701
|
285 |
-
283,phi_3_medium_4k_instruct,language_lb,13.91,[],livebench_240701
|
286 |
-
284,phi_3_medium_128k_instruct,language_lb,12.76,[],livebench_240701
|
287 |
-
285,deepseek_coder_v2_lite_instruct,language_lb,10.64,[],livebench_240701
|
288 |
-
286,qwen1.5_110b_chat,language_lb,13.22,[],livebench_240701
|
289 |
-
287,qwen1.5_72b_chat,language_lb,11.37,[],livebench_240701
|
290 |
-
288,command_r,language_lb,14.64,[],livebench_240701
|
291 |
-
289,phi_3_small_128k_instruct,language_lb,12.28,[],livebench_240701
|
292 |
-
290,meta_llama_3_8b_instruct,language_lb,18.72,[],livebench_240701
|
293 |
-
291,qwen2_7b_instruct,language_lb,10.21,[],livebench_240701
|
294 |
-
292,phi_3_small_8k_instruct,language_lb,14.96,[],livebench_240701
|
295 |
-
293,openhermes_2.5_mistral_7b,language_lb,11.37,[],livebench_240701
|
296 |
-
294,mixtral_8x7b_instruct_v0.1,language_lb,13.76,[],livebench_240701
|
297 |
-
295,mistral_7b_instruct_v0.2,language_lb,9.05,[],livebench_240701
|
298 |
-
296,phi_3_mini_4k_instruct,language_lb,7.1,[],livebench_240701
|
299 |
-
297,zephyr_7b_alpha,language_lb,7.2,[],livebench_240701
|
300 |
-
298,phi_3_mini_128k_instruct,language_lb,6.8,[],livebench_240701
|
301 |
-
299,zephyr_7b_beta,language_lb,4.28,[],livebench_240701
|
302 |
-
300,deepseek_v2_lite_chat,language_lb,9.2,[],livebench_240701
|
303 |
-
301,qwen1.5_7b_chat,language_lb,6.18,[],livebench_240701
|
304 |
-
302,starling_lm_7b_beta,language_lb,7.26,[],livebench_240701
|
305 |
-
303,vicuna_7b_v1.5_16k,language_lb,7.92,[],livebench_240701
|
306 |
-
304,vicuna_7b_v1.5,language_lb,8.66,[],livebench_240701
|
307 |
-
305,qwen1.5_4b_chat,language_lb,5.8,[],livebench_240701
|
308 |
-
306,llama_2_7b_chat,language_lb,6.86,[],livebench_240701
|
309 |
-
307,qwen2_1.5b_instruct,language_lb,3.05,[],livebench_240701
|
310 |
-
308,yi_6b_chat,language_lb,4.69,[],livebench_240701
|
311 |
-
309,qwen2_0.5b_instruct,language_lb,2.8,[],livebench_240701
|
312 |
-
310,qwen1.5_1.8b_chat,language_lb,3.16,[],livebench_240701
|
313 |
-
311,qwen1.5_0.5b_chat,language_lb,2.88,[],livebench_240701
|
314 |
-
312,claude_3_5_sonnet_20240620,if_lb,72.3,[],livebench_240701
|
315 |
-
313,gpt_4o_2024_05_13,if_lb,72.17,[],livebench_240701
|
316 |
-
314,gpt_4_turbo_2024_04_09,if_lb,71.39,[],livebench_240701
|
317 |
-
315,gpt_4_1106_preview,if_lb,69.39,[],livebench_240701
|
318 |
-
316,claude_3_opus_20240229,if_lb,70.87,[],livebench_240701
|
319 |
-
317,gpt_4_0125_preview,if_lb,63.92,[],livebench_240701
|
320 |
-
318,deepseek_coder_v2,if_lb,67.18,[],livebench_240701
|
321 |
-
319,gemini_1.5_pro_api_0514,if_lb,67.2,[],livebench_240701
|
322 |
-
320,gemma_2_27b_it,if_lb,67.37,[],livebench_240701
|
323 |
-
321,gemini_1.5_flash_api_0514,if_lb,63.01,[],livebench_240701
|
324 |
-
322,qwen2_72b_instruct,if_lb,68.27,[],livebench_240701
|
325 |
-
323,acm_rewrite_qwen2_72b_chat,if_lb,65.0,[],livebench_240701
|
326 |
-
324,mistral_large_2402,if_lb,68.19,[],livebench_240701
|
327 |
-
325,deepseek_chat_v2,if_lb,64.34,[],livebench_240701
|
328 |
-
326,claude_3_sonnet_20240229,if_lb,65.0,[],livebench_240701
|
329 |
-
327,meta_llama_3_70b_instruct,if_lb,63.5,[],livebench_240701
|
330 |
-
328,claude_3_haiku_20240307,if_lb,64.03,[],livebench_240701
|
331 |
-
329,mixtral_8x22b_instruct_v0.1,if_lb,63.17,[],livebench_240701
|
332 |
-
330,gpt_3.5_turbo_0125,if_lb,60.47,[],livebench_240701
|
333 |
-
331,gpt_3.5_turbo_1106,if_lb,51.53,[],livebench_240701
|
334 |
-
332,command_r_plus,if_lb,71.51,[],livebench_240701
|
335 |
-
333,mistral_small_2402,if_lb,63.91,[],livebench_240701
|
336 |
-
334,gemma_2_9b_it,if_lb,61.55,[],livebench_240701
|
337 |
-
335,phi_3_medium_4k_instruct,if_lb,53.3,[],livebench_240701
|
338 |
-
336,phi_3_medium_128k_instruct,if_lb,56.15,[],livebench_240701
|
339 |
-
337,deepseek_coder_v2_lite_instruct,if_lb,48.34,[],livebench_240701
|
340 |
-
338,qwen1.5_110b_chat,if_lb,55.26,[],livebench_240701
|
341 |
-
339,qwen1.5_72b_chat,if_lb,58.25,[],livebench_240701
|
342 |
-
340,command_r,if_lb,57.16,[],livebench_240701
|
343 |
-
341,phi_3_small_128k_instruct,if_lb,36.88,[],livebench_240701
|
344 |
-
342,meta_llama_3_8b_instruct,if_lb,57.14,[],livebench_240701
|
345 |
-
343,qwen2_7b_instruct,if_lb,44.74,[],livebench_240701
|
346 |
-
344,phi_3_small_8k_instruct,if_lb,48.24,[],livebench_240701
|
347 |
-
345,openhermes_2.5_mistral_7b,if_lb,52.78,[],livebench_240701
|
348 |
-
346,mixtral_8x7b_instruct_v0.1,if_lb,44.81,[],livebench_240701
|
349 |
-
347,mistral_7b_instruct_v0.2,if_lb,51.65,[],livebench_240701
|
350 |
-
348,phi_3_mini_4k_instruct,if_lb,40.05,[],livebench_240701
|
351 |
-
349,zephyr_7b_alpha,if_lb,52.79,[],livebench_240701
|
352 |
-
350,phi_3_mini_128k_instruct,if_lb,49.65,[],livebench_240701
|
353 |
-
351,zephyr_7b_beta,if_lb,48.32,[],livebench_240701
|
354 |
-
352,deepseek_v2_lite_chat,if_lb,41.83,[],livebench_240701
|
355 |
-
353,qwen1.5_7b_chat,if_lb,44.12,[],livebench_240701
|
356 |
-
354,starling_lm_7b_beta,if_lb,38.32,[],livebench_240701
|
357 |
-
355,vicuna_7b_v1.5_16k,if_lb,42.12,[],livebench_240701
|
358 |
-
356,vicuna_7b_v1.5,if_lb,41.75,[],livebench_240701
|
359 |
-
357,qwen1.5_4b_chat,if_lb,27.75,[],livebench_240701
|
360 |
-
358,llama_2_7b_chat,if_lb,44.88,[],livebench_240701
|
361 |
-
359,qwen2_1.5b_instruct,if_lb,25.9,[],livebench_240701
|
362 |
-
360,yi_6b_chat,if_lb,27.22,[],livebench_240701
|
363 |
-
361,qwen2_0.5b_instruct,if_lb,26.63,[],livebench_240701
|
364 |
-
362,qwen1.5_1.8b_chat,if_lb,22.9,[],livebench_240701
|
365 |
-
363,qwen1.5_0.5b_chat,if_lb,21.3,[],livebench_240701
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
assets/pointplot_granularity_matters.png
ADDED
cache/agreements_cache_42471fdfe00c7ff9b0aba18b66ab5a5f.csv
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value
|
2 |
+
hellaswag,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
|
3 |
+
humaneval,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.5270462766947298,0.206507295485425
|
4 |
+
mbpp,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.10540925533894598,0.8005421074231263
|
5 |
+
winogrande,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,-0.19999999999999998,0.8166666666666667
|
6 |
+
grounding,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.9486832980505137,0.02297740150320607
|
7 |
+
instruction_following,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.9486832980505137,0.02297740150320607
|
8 |
+
planning,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
|
9 |
+
reasoning,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
|
10 |
+
refinement,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
|
11 |
+
safety,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334
|
12 |
+
theory_of_mind,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
|
13 |
+
tool_usage,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
|
14 |
+
livebench_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334
|
15 |
+
reasoning_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.7378647873726218,0.07697417298126676
|
16 |
+
coding_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.5270462766947298,0.206507295485425
|
17 |
+
mathematics_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334
|
18 |
+
data_analysis_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
|
19 |
+
language_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
|
20 |
+
if_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
|
21 |
+
arena_hard,arena_hard_2404,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334
|
22 |
+
mixeval,mixeval_240601,aggregate,holistic,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
|
23 |
+
agieval,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.7378647873726218,0.07697417298126676
|
24 |
+
arc_c,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334
|
25 |
+
alpacav1,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.7378647873726218,0.07697417298126676
|
26 |
+
alpacav2,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
|
27 |
+
alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.5270462766947298,0.206507295485425
|
28 |
+
arena_elo,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
|
29 |
+
bbh,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.31622776601683794,0.44848886103153174
|
30 |
+
eq_benchv2,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
|
31 |
+
gpt4all,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
|
32 |
+
hugging_6,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
|
33 |
+
llmonitor,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334
|
34 |
+
magi,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
|
35 |
+
mmlu,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334
|
36 |
+
mt_bench,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334
|
37 |
+
biggen_mwr,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334
|
38 |
+
aggregate,holistic,hellaswag,BLZ_240312,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
|
39 |
+
aggregate,holistic,humaneval,BLZ_240312,kendall,top_aggregate,5,0,0.5270462766947298,0.206507295485425
|
40 |
+
aggregate,holistic,mbpp,BLZ_240312,kendall,top_aggregate,5,0,0.10540925533894598,0.8005421074231263
|
41 |
+
aggregate,holistic,winogrande,BLZ_240312,kendall,top_aggregate,5,0,-0.19999999999999998,0.8166666666666667
|
42 |
+
aggregate,holistic,grounding,biggen_240612,kendall,top_aggregate,5,0,0.9486832980505137,0.02297740150320607
|
43 |
+
aggregate,holistic,instruction_following,biggen_240612,kendall,top_aggregate,5,0,0.9486832980505137,0.02297740150320607
|
44 |
+
aggregate,holistic,planning,biggen_240612,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
|
45 |
+
aggregate,holistic,reasoning,biggen_240612,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
|
46 |
+
aggregate,holistic,refinement,biggen_240612,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
|
47 |
+
aggregate,holistic,safety,biggen_240612,kendall,top_aggregate,5,0,0.6,0.23333333333333334
|
48 |
+
aggregate,holistic,theory_of_mind,biggen_240612,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
|
49 |
+
aggregate,holistic,tool_usage,biggen_240612,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
|
50 |
+
aggregate,holistic,livebench_average,livebench_240701,kendall,top_aggregate,5,0,0.6,0.23333333333333334
|
51 |
+
aggregate,holistic,reasoning_average,livebench_240701,kendall,top_aggregate,5,0,0.7378647873726218,0.07697417298126676
|
52 |
+
aggregate,holistic,coding_average,livebench_240701,kendall,top_aggregate,5,0,0.5270462766947298,0.206507295485425
|
53 |
+
aggregate,holistic,mathematics_average,livebench_240701,kendall,top_aggregate,5,0,0.6,0.23333333333333334
|
54 |
+
aggregate,holistic,data_analysis_average,livebench_240701,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
|
55 |
+
aggregate,holistic,language_average,livebench_240701,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
|
56 |
+
aggregate,holistic,if_average,livebench_240701,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
|
57 |
+
aggregate,holistic,arena_hard,arena_hard_2404,kendall,top_aggregate,5,0,0.6,0.23333333333333334
|
58 |
+
aggregate,holistic,mixeval,mixeval_240601,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
|
59 |
+
aggregate,holistic,agieval,BLZ_240312,kendall,top_aggregate,5,0,0.7378647873726218,0.07697417298126676
|
60 |
+
aggregate,holistic,arc_c,BLZ_240312,kendall,top_aggregate,5,0,0.6,0.23333333333333334
|
61 |
+
aggregate,holistic,alpacav1,BLZ_240312,kendall,top_aggregate,5,0,0.7378647873726218,0.07697417298126676
|
62 |
+
aggregate,holistic,alpacav2,BLZ_240312,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
|
63 |
+
aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,top_aggregate,5,0,0.5270462766947298,0.206507295485425
|
64 |
+
aggregate,holistic,arena_elo,BLZ_240312,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
|
65 |
+
aggregate,holistic,bbh,BLZ_240312,kendall,top_aggregate,5,0,0.31622776601683794,0.44848886103153174
|
66 |
+
aggregate,holistic,eq_benchv2,BLZ_240312,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
|
67 |
+
aggregate,holistic,gpt4all,BLZ_240312,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
|
68 |
+
aggregate,holistic,hugging_6,BLZ_240312,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
|
69 |
+
aggregate,holistic,llmonitor,BLZ_240312,kendall,top_aggregate,5,0,0.6,0.23333333333333334
|
70 |
+
aggregate,holistic,magi,BLZ_240312,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
|
71 |
+
aggregate,holistic,mmlu,BLZ_240312,kendall,top_aggregate,5,0,0.6,0.23333333333333334
|
72 |
+
aggregate,holistic,mt_bench,BLZ_240312,kendall,top_aggregate,5,0,0.6,0.23333333333333334
|
73 |
+
aggregate,holistic,biggen_mwr,biggen_240612,kendall,top_aggregate,5,0,0.6,0.23333333333333334
|
cache/agreements_cache_6ac32881b7d0a3bf6d8762ff242ff449.csv
ADDED
@@ -0,0 +1,721 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value
|
2 |
+
hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.39999999999999997,0.48333333333333334
|
3 |
+
hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
4 |
+
hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
5 |
+
hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
|
6 |
+
hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.6,0.23333333333333334
|
7 |
+
hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.39999999999999997,0.48333333333333334
|
8 |
+
hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.39999999999999997,0.48333333333333334
|
9 |
+
hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.0,1.0
|
10 |
+
hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.19999999999999998,0.8166666666666667
|
11 |
+
hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
12 |
+
humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.19999999999999998,0.8166666666666667
|
13 |
+
humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
14 |
+
humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
15 |
+
humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.31622776601683794,0.44848886103153174
|
16 |
+
humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
17 |
+
humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.39999999999999997,0.48333333333333334
|
18 |
+
humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.39999999999999997,0.48333333333333334
|
19 |
+
humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.9999999999999999,0.016666666666666666
|
20 |
+
humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.39999999999999997,0.48333333333333334
|
21 |
+
humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.39999999999999997,0.48333333333333334
|
22 |
+
mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.9999999999999999,0.016666666666666666
|
23 |
+
mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.5270462766947298,0.206507295485425
|
24 |
+
mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
|
25 |
+
mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
|
26 |
+
mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
27 |
+
mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.5270462766947298,0.206507295485425
|
28 |
+
mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
29 |
+
mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.39999999999999997,0.48333333333333334
|
30 |
+
mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.39999999999999997,0.48333333333333334
|
31 |
+
mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,9,-0.10540925533894598,0.8005421074231263
|
32 |
+
winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,0,-0.10540925533894596,0.8005421074231263
|
33 |
+
winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
34 |
+
winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
35 |
+
winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
36 |
+
winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.19999999999999998,0.8166666666666667
|
37 |
+
winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
38 |
+
winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.39999999999999997,0.48333333333333334
|
39 |
+
winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.0,1.0
|
40 |
+
winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.0,1.0
|
41 |
+
winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
42 |
+
grounding,biggen_240612,aggregate,holistic,kendall,random,5,0,0.39999999999999997,0.48333333333333334
|
43 |
+
grounding,biggen_240612,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
44 |
+
grounding,biggen_240612,aggregate,holistic,kendall,random,5,2,0.39999999999999997,0.48333333333333334
|
45 |
+
grounding,biggen_240612,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
46 |
+
grounding,biggen_240612,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
47 |
+
grounding,biggen_240612,aggregate,holistic,kendall,random,5,5,0.9486832980505137,0.02297740150320607
|
48 |
+
grounding,biggen_240612,aggregate,holistic,kendall,random,5,6,0.9999999999999999,0.016666666666666666
|
49 |
+
grounding,biggen_240612,aggregate,holistic,kendall,random,5,7,0.19999999999999998,0.8166666666666667
|
50 |
+
grounding,biggen_240612,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
51 |
+
grounding,biggen_240612,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
52 |
+
instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,0,0.19999999999999998,0.8166666666666667
|
53 |
+
instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
54 |
+
instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,2,0.0,1.0
|
55 |
+
instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
56 |
+
instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
57 |
+
instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,5,0.9486832980505137,0.02297740150320607
|
58 |
+
instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,6,0.9999999999999999,0.016666666666666666
|
59 |
+
instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,7,-0.10540925533894598,0.8005421074231263
|
60 |
+
instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,8,0.9486832980505137,0.02297740150320607
|
61 |
+
instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
62 |
+
planning,biggen_240612,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
|
63 |
+
planning,biggen_240612,aggregate,holistic,kendall,random,5,1,0.6,0.23333333333333334
|
64 |
+
planning,biggen_240612,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
|
65 |
+
planning,biggen_240612,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
|
66 |
+
planning,biggen_240612,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
67 |
+
planning,biggen_240612,aggregate,holistic,kendall,random,5,5,0.39999999999999997,0.48333333333333334
|
68 |
+
planning,biggen_240612,aggregate,holistic,kendall,random,5,6,0.6,0.23333333333333334
|
69 |
+
planning,biggen_240612,aggregate,holistic,kendall,random,5,7,0.19999999999999998,0.8166666666666667
|
70 |
+
planning,biggen_240612,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
71 |
+
planning,biggen_240612,aggregate,holistic,kendall,random,5,9,0.6,0.23333333333333334
|
72 |
+
reasoning,biggen_240612,aggregate,holistic,kendall,random,5,0,0.39999999999999997,0.48333333333333334
|
73 |
+
reasoning,biggen_240612,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
74 |
+
reasoning,biggen_240612,aggregate,holistic,kendall,random,5,2,0.7378647873726218,0.07697417298126676
|
75 |
+
reasoning,biggen_240612,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
76 |
+
reasoning,biggen_240612,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
77 |
+
reasoning,biggen_240612,aggregate,holistic,kendall,random,5,5,0.9999999999999999,0.016666666666666666
|
78 |
+
reasoning,biggen_240612,aggregate,holistic,kendall,random,5,6,0.9999999999999999,0.016666666666666666
|
79 |
+
reasoning,biggen_240612,aggregate,holistic,kendall,random,5,7,0.39999999999999997,0.48333333333333334
|
80 |
+
reasoning,biggen_240612,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
81 |
+
reasoning,biggen_240612,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
82 |
+
refinement,biggen_240612,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
|
83 |
+
refinement,biggen_240612,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
84 |
+
refinement,biggen_240612,aggregate,holistic,kendall,random,5,2,0.19999999999999998,0.8166666666666667
|
85 |
+
refinement,biggen_240612,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
86 |
+
refinement,biggen_240612,aggregate,holistic,kendall,random,5,4,0.6,0.23333333333333334
|
87 |
+
refinement,biggen_240612,aggregate,holistic,kendall,random,5,5,0.39999999999999997,0.48333333333333334
|
88 |
+
refinement,biggen_240612,aggregate,holistic,kendall,random,5,6,0.9999999999999999,0.016666666666666666
|
89 |
+
refinement,biggen_240612,aggregate,holistic,kendall,random,5,7,-0.19999999999999998,0.8166666666666667
|
90 |
+
refinement,biggen_240612,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
91 |
+
refinement,biggen_240612,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
92 |
+
safety,biggen_240612,aggregate,holistic,kendall,random,5,0,0.39999999999999997,0.48333333333333334
|
93 |
+
safety,biggen_240612,aggregate,holistic,kendall,random,5,1,0.19999999999999998,0.8166666666666667
|
94 |
+
safety,biggen_240612,aggregate,holistic,kendall,random,5,2,-0.6,0.23333333333333334
|
95 |
+
safety,biggen_240612,aggregate,holistic,kendall,random,5,3,0.0,1.0
|
96 |
+
safety,biggen_240612,aggregate,holistic,kendall,random,5,4,0.39999999999999997,0.48333333333333334
|
97 |
+
safety,biggen_240612,aggregate,holistic,kendall,random,5,5,0.0,1.0
|
98 |
+
safety,biggen_240612,aggregate,holistic,kendall,random,5,6,0.39999999999999997,0.48333333333333334
|
99 |
+
safety,biggen_240612,aggregate,holistic,kendall,random,5,7,-0.6,0.23333333333333334
|
100 |
+
safety,biggen_240612,aggregate,holistic,kendall,random,5,8,-0.19999999999999998,0.8166666666666667
|
101 |
+
safety,biggen_240612,aggregate,holistic,kendall,random,5,9,0.39999999999999997,0.48333333333333334
|
102 |
+
theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
|
103 |
+
theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
104 |
+
theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,2,0.19999999999999998,0.8166666666666667
|
105 |
+
theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
106 |
+
theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
107 |
+
theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
108 |
+
theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,6,0.9999999999999999,0.016666666666666666
|
109 |
+
theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,7,-0.19999999999999998,0.8166666666666667
|
110 |
+
theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
111 |
+
theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
112 |
+
tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
|
113 |
+
tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
114 |
+
tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,2,0.9999999999999999,0.016666666666666666
|
115 |
+
tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
116 |
+
tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,4,0.6,0.23333333333333334
|
117 |
+
tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
|
118 |
+
tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
119 |
+
tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,7,0.0,1.0
|
120 |
+
tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
121 |
+
tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
122 |
+
livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,0,0.39999999999999997,0.48333333333333334
|
123 |
+
livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
124 |
+
livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
|
125 |
+
livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,3,0.39999999999999997,0.48333333333333334
|
126 |
+
livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,4,0.6,0.23333333333333334
|
127 |
+
livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
128 |
+
livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,6,0.6,0.23333333333333334
|
129 |
+
livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
130 |
+
livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,8,0.6,0.23333333333333334
|
131 |
+
livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
132 |
+
reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
|
133 |
+
reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,1,0.7378647873726218,0.07697417298126676
|
134 |
+
reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
|
135 |
+
reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,3,0.5270462766947298,0.206507295485425
|
136 |
+
reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,4,0.7378647873726218,0.07697417298126676
|
137 |
+
reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,5,0.7378647873726218,0.07697417298126676
|
138 |
+
reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,6,0.7378647873726218,0.07697417298126676
|
139 |
+
reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
140 |
+
reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,8,0.7378647873726218,0.07697417298126676
|
141 |
+
reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,9,0.7378647873726218,0.07697417298126676
|
142 |
+
coding_average,livebench_240701,aggregate,holistic,kendall,random,5,0,0.39999999999999997,0.48333333333333334
|
143 |
+
coding_average,livebench_240701,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
144 |
+
coding_average,livebench_240701,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
|
145 |
+
coding_average,livebench_240701,aggregate,holistic,kendall,random,5,3,0.31622776601683794,0.44848886103153174
|
146 |
+
coding_average,livebench_240701,aggregate,holistic,kendall,random,5,4,0.5270462766947298,0.206507295485425
|
147 |
+
coding_average,livebench_240701,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
148 |
+
coding_average,livebench_240701,aggregate,holistic,kendall,random,5,6,0.5270462766947298,0.206507295485425
|
149 |
+
coding_average,livebench_240701,aggregate,holistic,kendall,random,5,7,0.7378647873726218,0.07697417298126676
|
150 |
+
coding_average,livebench_240701,aggregate,holistic,kendall,random,5,8,0.5270462766947298,0.206507295485425
|
151 |
+
coding_average,livebench_240701,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
152 |
+
mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,0,0.39999999999999997,0.48333333333333334
|
153 |
+
mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
154 |
+
mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
|
155 |
+
mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,3,0.39999999999999997,0.48333333333333334
|
156 |
+
mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
157 |
+
mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,5,0.9999999999999999,0.016666666666666666
|
158 |
+
mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,6,0.6,0.23333333333333334
|
159 |
+
mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
160 |
+
mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
161 |
+
mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
162 |
+
data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
|
163 |
+
data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
164 |
+
data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
165 |
+
data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
|
166 |
+
data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
167 |
+
data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
168 |
+
data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,6,0.9999999999999999,0.016666666666666666
|
169 |
+
data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,7,0.9999999999999999,0.016666666666666666
|
170 |
+
data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
171 |
+
data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
172 |
+
language_average,livebench_240701,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
|
173 |
+
language_average,livebench_240701,aggregate,holistic,kendall,random,5,1,0.6,0.23333333333333334
|
174 |
+
language_average,livebench_240701,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
|
175 |
+
language_average,livebench_240701,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
|
176 |
+
language_average,livebench_240701,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
177 |
+
language_average,livebench_240701,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
|
178 |
+
language_average,livebench_240701,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
179 |
+
language_average,livebench_240701,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
180 |
+
language_average,livebench_240701,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
181 |
+
language_average,livebench_240701,aggregate,holistic,kendall,random,5,9,0.6,0.23333333333333334
|
182 |
+
if_average,livebench_240701,aggregate,holistic,kendall,random,5,0,0.39999999999999997,0.48333333333333334
|
183 |
+
if_average,livebench_240701,aggregate,holistic,kendall,random,5,1,0.6,0.23333333333333334
|
184 |
+
if_average,livebench_240701,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
185 |
+
if_average,livebench_240701,aggregate,holistic,kendall,random,5,3,0.0,1.0
|
186 |
+
if_average,livebench_240701,aggregate,holistic,kendall,random,5,4,0.0,1.0
|
187 |
+
if_average,livebench_240701,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
|
188 |
+
if_average,livebench_240701,aggregate,holistic,kendall,random,5,6,0.19999999999999998,0.8166666666666667
|
189 |
+
if_average,livebench_240701,aggregate,holistic,kendall,random,5,7,0.39999999999999997,0.48333333333333334
|
190 |
+
if_average,livebench_240701,aggregate,holistic,kendall,random,5,8,0.0,1.0
|
191 |
+
if_average,livebench_240701,aggregate,holistic,kendall,random,5,9,0.6,0.23333333333333334
|
192 |
+
arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,0,0.9999999999999999,0.016666666666666666
|
193 |
+
arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,1,0.39999999999999997,0.48333333333333334
|
194 |
+
arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
195 |
+
arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
196 |
+
arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
197 |
+
arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
|
198 |
+
arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,6,0.9999999999999999,0.016666666666666666
|
199 |
+
arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
200 |
+
arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
201 |
+
arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,9,0.9486832980505138,0.02297740150320607
|
202 |
+
mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,0,0.7999999999999999,0.08333333333333333
|
203 |
+
mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
204 |
+
mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
205 |
+
mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
206 |
+
mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
207 |
+
mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,5,0.9999999999999999,0.016666666666666666
|
208 |
+
mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
209 |
+
mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,7,0.9999999999999999,0.016666666666666666
|
210 |
+
mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
211 |
+
mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
212 |
+
agieval,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.5270462766947299,0.206507295485425
|
213 |
+
agieval,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
214 |
+
agieval,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
|
215 |
+
agieval,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
216 |
+
agieval,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
217 |
+
agieval,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
218 |
+
agieval,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.9999999999999999,0.016666666666666666
|
219 |
+
agieval,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.6,0.23333333333333334
|
220 |
+
agieval,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.19999999999999998,0.8166666666666667
|
221 |
+
agieval,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.6,0.23333333333333334
|
222 |
+
arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.5270462766947298,0.206507295485425
|
223 |
+
arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
224 |
+
arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.9999999999999999,0.016666666666666666
|
225 |
+
arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
226 |
+
arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
227 |
+
arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
228 |
+
arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
229 |
+
arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.9999999999999999,0.016666666666666666
|
230 |
+
arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.19999999999999998,0.8166666666666667
|
231 |
+
arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
232 |
+
alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.19999999999999998,0.8166666666666667
|
233 |
+
alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
234 |
+
alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
|
235 |
+
alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
236 |
+
alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
237 |
+
alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.39999999999999997,0.48333333333333334
|
238 |
+
alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.19999999999999998,0.8166666666666667
|
239 |
+
alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.39999999999999997,0.48333333333333334
|
240 |
+
alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
241 |
+
alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
242 |
+
alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
|
243 |
+
alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
244 |
+
alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
245 |
+
alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
246 |
+
alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
247 |
+
alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.9999999999999999,0.016666666666666666
|
248 |
+
alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
249 |
+
alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.9999999999999999,0.016666666666666666
|
250 |
+
alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
251 |
+
alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
252 |
+
alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.5270462766947299,0.206507295485425
|
253 |
+
alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.39999999999999997,0.48333333333333334
|
254 |
+
alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
255 |
+
alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
|
256 |
+
alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
257 |
+
alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
|
258 |
+
alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
259 |
+
alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.6,0.23333333333333334
|
260 |
+
alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.19999999999999998,0.8166666666666667
|
261 |
+
alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
262 |
+
arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.9999999999999999,0.016666666666666666
|
263 |
+
arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
264 |
+
arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,2,1.0,0.019176729141549043
|
265 |
+
arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
266 |
+
arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
267 |
+
arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.9999999999999999,0.016666666666666666
|
268 |
+
arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.9999999999999999,0.016666666666666666
|
269 |
+
arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.9999999999999999,0.016666666666666666
|
270 |
+
arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
271 |
+
arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
272 |
+
bbh,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.39999999999999997,0.48333333333333334
|
273 |
+
bbh,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
274 |
+
bbh,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
275 |
+
bbh,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
276 |
+
bbh,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
277 |
+
bbh,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
278 |
+
bbh,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.9486832980505137,0.02297740150320607
|
279 |
+
bbh,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.19999999999999998,0.8166666666666667
|
280 |
+
bbh,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.0,1.0
|
281 |
+
bbh,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
282 |
+
eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.7999999999999999,0.08333333333333333
|
283 |
+
eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.6,0.23333333333333334
|
284 |
+
eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
|
285 |
+
eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
286 |
+
eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
287 |
+
eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
|
288 |
+
eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.6,0.23333333333333334
|
289 |
+
eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.5270462766947299,0.206507295485425
|
290 |
+
eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
291 |
+
eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.6,0.23333333333333334
|
292 |
+
gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
|
293 |
+
gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.0,1.0
|
294 |
+
gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
295 |
+
gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
296 |
+
gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.6,0.23333333333333334
|
297 |
+
gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.7378647873726218,0.07697417298126676
|
298 |
+
gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.6,0.23333333333333334
|
299 |
+
gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.6,0.23333333333333334
|
300 |
+
gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
301 |
+
gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.39999999999999997,0.48333333333333334
|
302 |
+
hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
|
303 |
+
hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
304 |
+
hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.19999999999999998,0.8166666666666667
|
305 |
+
hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
306 |
+
hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.6,0.23333333333333334
|
307 |
+
hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.39999999999999997,0.48333333333333334
|
308 |
+
hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.39999999999999997,0.48333333333333334
|
309 |
+
hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.6,0.23333333333333334
|
310 |
+
hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.0,1.0
|
311 |
+
hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.39999999999999997,0.48333333333333334
|
312 |
+
llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.8366600265340756,0.05220363534131463
|
313 |
+
llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
314 |
+
llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
315 |
+
llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
|
316 |
+
llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.6,0.23333333333333334
|
317 |
+
llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.9999999999999999,0.016666666666666666
|
318 |
+
llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.7378647873726218,0.07697417298126676
|
319 |
+
llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.5270462766947298,0.206507295485425
|
320 |
+
llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.3333333333333333,0.4349833603383296
|
321 |
+
llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.39999999999999997,0.48333333333333334
|
322 |
+
magi,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.7999999999999999,0.08333333333333333
|
323 |
+
magi,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.5270462766947299,0.206507295485425
|
324 |
+
magi,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
|
325 |
+
magi,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
326 |
+
magi,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
327 |
+
magi,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.9999999999999999,0.016666666666666666
|
328 |
+
magi,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.10540925533894596,0.8005421074231263
|
329 |
+
magi,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.39999999999999997,0.48333333333333334
|
330 |
+
magi,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.31622776601683794,0.44848886103153174
|
331 |
+
magi,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
332 |
+
mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.39999999999999997,0.48333333333333334
|
333 |
+
mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
334 |
+
mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.19999999999999998,0.8166666666666667
|
335 |
+
mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
336 |
+
mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.39999999999999997,0.48333333333333334
|
337 |
+
mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
|
338 |
+
mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.6,0.23333333333333334
|
339 |
+
mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.9999999999999999,0.016666666666666666
|
340 |
+
mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.6,0.23333333333333334
|
341 |
+
mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
342 |
+
mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.7999999999999999,0.08333333333333333
|
343 |
+
mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
344 |
+
mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
345 |
+
mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
346 |
+
mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
347 |
+
mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
348 |
+
mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.9999999999999999,0.016666666666666666
|
349 |
+
mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.9999999999999999,0.016666666666666666
|
350 |
+
mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
351 |
+
mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
352 |
+
biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
|
353 |
+
biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,1,0.39999999999999997,0.48333333333333334
|
354 |
+
biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,2,0.0,1.0
|
355 |
+
biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
|
356 |
+
biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,4,0.39999999999999997,0.48333333333333334
|
357 |
+
biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
358 |
+
biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,6,0.39999999999999997,0.48333333333333334
|
359 |
+
biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,7,0.9999999999999999,0.016666666666666666
|
360 |
+
biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,8,0.6,0.23333333333333334
|
361 |
+
biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,9,0.6,0.23333333333333334
|
362 |
+
aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,0,0.39999999999999997,0.48333333333333334
|
363 |
+
aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
364 |
+
aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
365 |
+
aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,3,0.6,0.23333333333333334
|
366 |
+
aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,4,0.6,0.23333333333333334
|
367 |
+
aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,5,0.39999999999999997,0.48333333333333334
|
368 |
+
aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,6,0.39999999999999997,0.48333333333333334
|
369 |
+
aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,7,0.0,1.0
|
370 |
+
aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,8,0.19999999999999998,0.8166666666666667
|
371 |
+
aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
372 |
+
aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,0,0.19999999999999998,0.8166666666666667
|
373 |
+
aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
374 |
+
aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
375 |
+
aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,3,0.31622776601683794,0.44848886103153174
|
376 |
+
aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
377 |
+
aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,5,0.39999999999999997,0.48333333333333334
|
378 |
+
aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,6,0.39999999999999997,0.48333333333333334
|
379 |
+
aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,7,0.9999999999999999,0.016666666666666666
|
380 |
+
aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,8,0.39999999999999997,0.48333333333333334
|
381 |
+
aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,9,0.39999999999999997,0.48333333333333334
|
382 |
+
aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,0,0.9999999999999999,0.016666666666666666
|
383 |
+
aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,1,0.5270462766947298,0.206507295485425
|
384 |
+
aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,2,0.6,0.23333333333333334
|
385 |
+
aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,3,0.6,0.23333333333333334
|
386 |
+
aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
387 |
+
aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,5,0.5270462766947298,0.206507295485425
|
388 |
+
aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
389 |
+
aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,7,0.39999999999999997,0.48333333333333334
|
390 |
+
aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,8,0.39999999999999997,0.48333333333333334
|
391 |
+
aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,9,-0.10540925533894598,0.8005421074231263
|
392 |
+
aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,0,-0.10540925533894596,0.8005421074231263
|
393 |
+
aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
394 |
+
aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
395 |
+
aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
396 |
+
aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,4,0.19999999999999998,0.8166666666666667
|
397 |
+
aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
398 |
+
aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,6,0.39999999999999997,0.48333333333333334
|
399 |
+
aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,7,0.0,1.0
|
400 |
+
aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,8,0.0,1.0
|
401 |
+
aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
402 |
+
aggregate,holistic,grounding,biggen_240612,kendall,random,5,0,0.39999999999999997,0.48333333333333334
|
403 |
+
aggregate,holistic,grounding,biggen_240612,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
404 |
+
aggregate,holistic,grounding,biggen_240612,kendall,random,5,2,0.39999999999999997,0.48333333333333334
|
405 |
+
aggregate,holistic,grounding,biggen_240612,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
406 |
+
aggregate,holistic,grounding,biggen_240612,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
407 |
+
aggregate,holistic,grounding,biggen_240612,kendall,random,5,5,0.9486832980505137,0.02297740150320607
|
408 |
+
aggregate,holistic,grounding,biggen_240612,kendall,random,5,6,0.9999999999999999,0.016666666666666666
|
409 |
+
aggregate,holistic,grounding,biggen_240612,kendall,random,5,7,0.19999999999999998,0.8166666666666667
|
410 |
+
aggregate,holistic,grounding,biggen_240612,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
411 |
+
aggregate,holistic,grounding,biggen_240612,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
412 |
+
aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,0,0.19999999999999998,0.8166666666666667
|
413 |
+
aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
414 |
+
aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,2,0.0,1.0
|
415 |
+
aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
416 |
+
aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
417 |
+
aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,5,0.9486832980505137,0.02297740150320607
|
418 |
+
aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,6,0.9999999999999999,0.016666666666666666
|
419 |
+
aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,7,-0.10540925533894598,0.8005421074231263
|
420 |
+
aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,8,0.9486832980505137,0.02297740150320607
|
421 |
+
aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
422 |
+
aggregate,holistic,planning,biggen_240612,kendall,random,5,0,0.6,0.23333333333333334
|
423 |
+
aggregate,holistic,planning,biggen_240612,kendall,random,5,1,0.6,0.23333333333333334
|
424 |
+
aggregate,holistic,planning,biggen_240612,kendall,random,5,2,0.6,0.23333333333333334
|
425 |
+
aggregate,holistic,planning,biggen_240612,kendall,random,5,3,0.6,0.23333333333333334
|
426 |
+
aggregate,holistic,planning,biggen_240612,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
427 |
+
aggregate,holistic,planning,biggen_240612,kendall,random,5,5,0.39999999999999997,0.48333333333333334
|
428 |
+
aggregate,holistic,planning,biggen_240612,kendall,random,5,6,0.6,0.23333333333333334
|
429 |
+
aggregate,holistic,planning,biggen_240612,kendall,random,5,7,0.19999999999999998,0.8166666666666667
|
430 |
+
aggregate,holistic,planning,biggen_240612,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
431 |
+
aggregate,holistic,planning,biggen_240612,kendall,random,5,9,0.6,0.23333333333333334
|
432 |
+
aggregate,holistic,reasoning,biggen_240612,kendall,random,5,0,0.39999999999999997,0.48333333333333334
|
433 |
+
aggregate,holistic,reasoning,biggen_240612,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
434 |
+
aggregate,holistic,reasoning,biggen_240612,kendall,random,5,2,0.7378647873726218,0.07697417298126676
|
435 |
+
aggregate,holistic,reasoning,biggen_240612,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
436 |
+
aggregate,holistic,reasoning,biggen_240612,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
437 |
+
aggregate,holistic,reasoning,biggen_240612,kendall,random,5,5,0.9999999999999999,0.016666666666666666
|
438 |
+
aggregate,holistic,reasoning,biggen_240612,kendall,random,5,6,0.9999999999999999,0.016666666666666666
|
439 |
+
aggregate,holistic,reasoning,biggen_240612,kendall,random,5,7,0.39999999999999997,0.48333333333333334
|
440 |
+
aggregate,holistic,reasoning,biggen_240612,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
441 |
+
aggregate,holistic,reasoning,biggen_240612,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
442 |
+
aggregate,holistic,refinement,biggen_240612,kendall,random,5,0,0.6,0.23333333333333334
|
443 |
+
aggregate,holistic,refinement,biggen_240612,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
444 |
+
aggregate,holistic,refinement,biggen_240612,kendall,random,5,2,0.19999999999999998,0.8166666666666667
|
445 |
+
aggregate,holistic,refinement,biggen_240612,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
446 |
+
aggregate,holistic,refinement,biggen_240612,kendall,random,5,4,0.6,0.23333333333333334
|
447 |
+
aggregate,holistic,refinement,biggen_240612,kendall,random,5,5,0.39999999999999997,0.48333333333333334
|
448 |
+
aggregate,holistic,refinement,biggen_240612,kendall,random,5,6,0.9999999999999999,0.016666666666666666
|
449 |
+
aggregate,holistic,refinement,biggen_240612,kendall,random,5,7,-0.19999999999999998,0.8166666666666667
|
450 |
+
aggregate,holistic,refinement,biggen_240612,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
451 |
+
aggregate,holistic,refinement,biggen_240612,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
452 |
+
aggregate,holistic,safety,biggen_240612,kendall,random,5,0,0.39999999999999997,0.48333333333333334
|
453 |
+
aggregate,holistic,safety,biggen_240612,kendall,random,5,1,0.19999999999999998,0.8166666666666667
|
454 |
+
aggregate,holistic,safety,biggen_240612,kendall,random,5,2,-0.6,0.23333333333333334
|
455 |
+
aggregate,holistic,safety,biggen_240612,kendall,random,5,3,0.0,1.0
|
456 |
+
aggregate,holistic,safety,biggen_240612,kendall,random,5,4,0.39999999999999997,0.48333333333333334
|
457 |
+
aggregate,holistic,safety,biggen_240612,kendall,random,5,5,0.0,1.0
|
458 |
+
aggregate,holistic,safety,biggen_240612,kendall,random,5,6,0.39999999999999997,0.48333333333333334
|
459 |
+
aggregate,holistic,safety,biggen_240612,kendall,random,5,7,-0.6,0.23333333333333334
|
460 |
+
aggregate,holistic,safety,biggen_240612,kendall,random,5,8,-0.19999999999999998,0.8166666666666667
|
461 |
+
aggregate,holistic,safety,biggen_240612,kendall,random,5,9,0.39999999999999997,0.48333333333333334
|
462 |
+
aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,0,0.6,0.23333333333333334
|
463 |
+
aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
464 |
+
aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,2,0.19999999999999998,0.8166666666666667
|
465 |
+
aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
466 |
+
aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
467 |
+
aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
468 |
+
aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,6,0.9999999999999999,0.016666666666666666
|
469 |
+
aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,7,-0.19999999999999998,0.8166666666666667
|
470 |
+
aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
471 |
+
aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
472 |
+
aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,0,0.6,0.23333333333333334
|
473 |
+
aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
474 |
+
aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,2,0.9999999999999999,0.016666666666666666
|
475 |
+
aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
476 |
+
aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,4,0.6,0.23333333333333334
|
477 |
+
aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,5,0.6,0.23333333333333334
|
478 |
+
aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
479 |
+
aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,7,0.0,1.0
|
480 |
+
aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
481 |
+
aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
482 |
+
aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,0,0.39999999999999997,0.48333333333333334
|
483 |
+
aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
484 |
+
aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,2,0.6,0.23333333333333334
|
485 |
+
aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,3,0.39999999999999997,0.48333333333333334
|
486 |
+
aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,4,0.6,0.23333333333333334
|
487 |
+
aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
488 |
+
aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,6,0.6,0.23333333333333334
|
489 |
+
aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
490 |
+
aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,8,0.6,0.23333333333333334
|
491 |
+
aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
492 |
+
aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,0,0.6,0.23333333333333334
|
493 |
+
aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,1,0.7378647873726218,0.07697417298126676
|
494 |
+
aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,2,0.6,0.23333333333333334
|
495 |
+
aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,3,0.5270462766947298,0.206507295485425
|
496 |
+
aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,4,0.7378647873726218,0.07697417298126676
|
497 |
+
aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,5,0.7378647873726218,0.07697417298126676
|
498 |
+
aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,6,0.7378647873726218,0.07697417298126676
|
499 |
+
aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
500 |
+
aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,8,0.7378647873726218,0.07697417298126676
|
501 |
+
aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,9,0.7378647873726218,0.07697417298126676
|
502 |
+
aggregate,holistic,coding_average,livebench_240701,kendall,random,5,0,0.39999999999999997,0.48333333333333334
|
503 |
+
aggregate,holistic,coding_average,livebench_240701,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
504 |
+
aggregate,holistic,coding_average,livebench_240701,kendall,random,5,2,0.6,0.23333333333333334
|
505 |
+
aggregate,holistic,coding_average,livebench_240701,kendall,random,5,3,0.31622776601683794,0.44848886103153174
|
506 |
+
aggregate,holistic,coding_average,livebench_240701,kendall,random,5,4,0.5270462766947298,0.206507295485425
|
507 |
+
aggregate,holistic,coding_average,livebench_240701,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
508 |
+
aggregate,holistic,coding_average,livebench_240701,kendall,random,5,6,0.5270462766947298,0.206507295485425
|
509 |
+
aggregate,holistic,coding_average,livebench_240701,kendall,random,5,7,0.7378647873726218,0.07697417298126676
|
510 |
+
aggregate,holistic,coding_average,livebench_240701,kendall,random,5,8,0.5270462766947298,0.206507295485425
|
511 |
+
aggregate,holistic,coding_average,livebench_240701,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
512 |
+
aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,0,0.39999999999999997,0.48333333333333334
|
513 |
+
aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
514 |
+
aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,2,0.6,0.23333333333333334
|
515 |
+
aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,3,0.39999999999999997,0.48333333333333334
|
516 |
+
aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
517 |
+
aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,5,0.9999999999999999,0.016666666666666666
|
518 |
+
aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,6,0.6,0.23333333333333334
|
519 |
+
aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
520 |
+
aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
521 |
+
aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
522 |
+
aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,0,0.6,0.23333333333333334
|
523 |
+
aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
524 |
+
aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
525 |
+
aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,3,0.6,0.23333333333333334
|
526 |
+
aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
527 |
+
aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
528 |
+
aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,6,0.9999999999999999,0.016666666666666666
|
529 |
+
aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,7,0.9999999999999999,0.016666666666666666
|
530 |
+
aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
531 |
+
aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
532 |
+
aggregate,holistic,language_average,livebench_240701,kendall,random,5,0,0.6,0.23333333333333334
|
533 |
+
aggregate,holistic,language_average,livebench_240701,kendall,random,5,1,0.6,0.23333333333333334
|
534 |
+
aggregate,holistic,language_average,livebench_240701,kendall,random,5,2,0.6,0.23333333333333334
|
535 |
+
aggregate,holistic,language_average,livebench_240701,kendall,random,5,3,0.6,0.23333333333333334
|
536 |
+
aggregate,holistic,language_average,livebench_240701,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
537 |
+
aggregate,holistic,language_average,livebench_240701,kendall,random,5,5,0.6,0.23333333333333334
|
538 |
+
aggregate,holistic,language_average,livebench_240701,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
539 |
+
aggregate,holistic,language_average,livebench_240701,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
540 |
+
aggregate,holistic,language_average,livebench_240701,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
541 |
+
aggregate,holistic,language_average,livebench_240701,kendall,random,5,9,0.6,0.23333333333333334
|
542 |
+
aggregate,holistic,if_average,livebench_240701,kendall,random,5,0,0.39999999999999997,0.48333333333333334
|
543 |
+
aggregate,holistic,if_average,livebench_240701,kendall,random,5,1,0.6,0.23333333333333334
|
544 |
+
aggregate,holistic,if_average,livebench_240701,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
545 |
+
aggregate,holistic,if_average,livebench_240701,kendall,random,5,3,0.0,1.0
|
546 |
+
aggregate,holistic,if_average,livebench_240701,kendall,random,5,4,0.0,1.0
|
547 |
+
aggregate,holistic,if_average,livebench_240701,kendall,random,5,5,0.6,0.23333333333333334
|
548 |
+
aggregate,holistic,if_average,livebench_240701,kendall,random,5,6,0.19999999999999998,0.8166666666666667
|
549 |
+
aggregate,holistic,if_average,livebench_240701,kendall,random,5,7,0.39999999999999997,0.48333333333333334
|
550 |
+
aggregate,holistic,if_average,livebench_240701,kendall,random,5,8,0.0,1.0
|
551 |
+
aggregate,holistic,if_average,livebench_240701,kendall,random,5,9,0.6,0.23333333333333334
|
552 |
+
aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,0,0.9999999999999999,0.016666666666666666
|
553 |
+
aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,1,0.39999999999999997,0.48333333333333334
|
554 |
+
aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
555 |
+
aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
556 |
+
aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
557 |
+
aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,5,0.6,0.23333333333333334
|
558 |
+
aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,6,0.9999999999999999,0.016666666666666666
|
559 |
+
aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
560 |
+
aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
561 |
+
aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,9,0.9486832980505138,0.02297740150320607
|
562 |
+
aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,0,0.7999999999999999,0.08333333333333333
|
563 |
+
aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
564 |
+
aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
565 |
+
aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
566 |
+
aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
567 |
+
aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,5,0.9999999999999999,0.016666666666666666
|
568 |
+
aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
569 |
+
aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,7,0.9999999999999999,0.016666666666666666
|
570 |
+
aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
571 |
+
aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
572 |
+
aggregate,holistic,agieval,BLZ_240312,kendall,random,5,0,0.5270462766947299,0.206507295485425
|
573 |
+
aggregate,holistic,agieval,BLZ_240312,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
574 |
+
aggregate,holistic,agieval,BLZ_240312,kendall,random,5,2,0.6,0.23333333333333334
|
575 |
+
aggregate,holistic,agieval,BLZ_240312,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
576 |
+
aggregate,holistic,agieval,BLZ_240312,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
577 |
+
aggregate,holistic,agieval,BLZ_240312,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
578 |
+
aggregate,holistic,agieval,BLZ_240312,kendall,random,5,6,0.9999999999999999,0.016666666666666666
|
579 |
+
aggregate,holistic,agieval,BLZ_240312,kendall,random,5,7,0.6,0.23333333333333334
|
580 |
+
aggregate,holistic,agieval,BLZ_240312,kendall,random,5,8,0.19999999999999998,0.8166666666666667
|
581 |
+
aggregate,holistic,agieval,BLZ_240312,kendall,random,5,9,0.6,0.23333333333333334
|
582 |
+
aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,0,0.5270462766947298,0.206507295485425
|
583 |
+
aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
584 |
+
aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,2,0.9999999999999999,0.016666666666666666
|
585 |
+
aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
586 |
+
aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
587 |
+
aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
588 |
+
aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
589 |
+
aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,7,0.9999999999999999,0.016666666666666666
|
590 |
+
aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,8,0.19999999999999998,0.8166666666666667
|
591 |
+
aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
592 |
+
aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,0,0.19999999999999998,0.8166666666666667
|
593 |
+
aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
594 |
+
aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,2,0.6,0.23333333333333334
|
595 |
+
aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
596 |
+
aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
597 |
+
aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,5,0.39999999999999997,0.48333333333333334
|
598 |
+
aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,6,0.19999999999999998,0.8166666666666667
|
599 |
+
aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,7,0.39999999999999997,0.48333333333333334
|
600 |
+
aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
601 |
+
aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
602 |
+
aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,0,0.6,0.23333333333333334
|
603 |
+
aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
604 |
+
aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
605 |
+
aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
606 |
+
aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
607 |
+
aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,5,0.9999999999999999,0.016666666666666666
|
608 |
+
aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
609 |
+
aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,7,0.9999999999999999,0.016666666666666666
|
610 |
+
aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
611 |
+
aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
612 |
+
aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,0,0.5270462766947299,0.206507295485425
|
613 |
+
aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,1,0.39999999999999997,0.48333333333333334
|
614 |
+
aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
615 |
+
aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,3,0.6,0.23333333333333334
|
616 |
+
aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
617 |
+
aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,5,0.6,0.23333333333333334
|
618 |
+
aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
619 |
+
aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,7,0.6,0.23333333333333334
|
620 |
+
aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,8,0.19999999999999998,0.8166666666666667
|
621 |
+
aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
622 |
+
aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,0,0.9999999999999999,0.016666666666666666
|
623 |
+
aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
624 |
+
aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,2,1.0,0.019176729141549043
|
625 |
+
aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
626 |
+
aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
627 |
+
aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,5,0.9999999999999999,0.016666666666666666
|
628 |
+
aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,6,0.9999999999999999,0.016666666666666666
|
629 |
+
aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,7,0.9999999999999999,0.016666666666666666
|
630 |
+
aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
631 |
+
aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
632 |
+
aggregate,holistic,bbh,BLZ_240312,kendall,random,5,0,0.39999999999999997,0.48333333333333334
|
633 |
+
aggregate,holistic,bbh,BLZ_240312,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
634 |
+
aggregate,holistic,bbh,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
635 |
+
aggregate,holistic,bbh,BLZ_240312,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
636 |
+
aggregate,holistic,bbh,BLZ_240312,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
637 |
+
aggregate,holistic,bbh,BLZ_240312,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
638 |
+
aggregate,holistic,bbh,BLZ_240312,kendall,random,5,6,0.9486832980505137,0.02297740150320607
|
639 |
+
aggregate,holistic,bbh,BLZ_240312,kendall,random,5,7,0.19999999999999998,0.8166666666666667
|
640 |
+
aggregate,holistic,bbh,BLZ_240312,kendall,random,5,8,0.0,1.0
|
641 |
+
aggregate,holistic,bbh,BLZ_240312,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
642 |
+
aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,0,0.7999999999999999,0.08333333333333333
|
643 |
+
aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,1,0.6,0.23333333333333334
|
644 |
+
aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,2,0.6,0.23333333333333334
|
645 |
+
aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
646 |
+
aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
647 |
+
aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,5,0.6,0.23333333333333334
|
648 |
+
aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,6,0.6,0.23333333333333334
|
649 |
+
aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,7,0.5270462766947299,0.206507295485425
|
650 |
+
aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
651 |
+
aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,9,0.6,0.23333333333333334
|
652 |
+
aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,0,0.6,0.23333333333333334
|
653 |
+
aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,1,0.0,1.0
|
654 |
+
aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
655 |
+
aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
656 |
+
aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,4,0.6,0.23333333333333334
|
657 |
+
aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,5,0.7378647873726218,0.07697417298126676
|
658 |
+
aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,6,0.6,0.23333333333333334
|
659 |
+
aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,7,0.6,0.23333333333333334
|
660 |
+
aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
661 |
+
aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,9,0.39999999999999997,0.48333333333333334
|
662 |
+
aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,0,0.6,0.23333333333333334
|
663 |
+
aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
664 |
+
aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,2,0.19999999999999998,0.8166666666666667
|
665 |
+
aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
666 |
+
aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,4,0.6,0.23333333333333334
|
667 |
+
aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,5,0.39999999999999997,0.48333333333333334
|
668 |
+
aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,6,0.39999999999999997,0.48333333333333334
|
669 |
+
aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,7,0.6,0.23333333333333334
|
670 |
+
aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,8,0.0,1.0
|
671 |
+
aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,9,0.39999999999999997,0.48333333333333334
|
672 |
+
aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,0,0.8366600265340756,0.05220363534131463
|
673 |
+
aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
674 |
+
aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
675 |
+
aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,3,0.6,0.23333333333333334
|
676 |
+
aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,4,0.6,0.23333333333333334
|
677 |
+
aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,5,0.9999999999999999,0.016666666666666666
|
678 |
+
aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,6,0.7378647873726218,0.07697417298126676
|
679 |
+
aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,7,0.5270462766947298,0.206507295485425
|
680 |
+
aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,8,0.3333333333333333,0.4349833603383296
|
681 |
+
aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,9,0.39999999999999997,0.48333333333333334
|
682 |
+
aggregate,holistic,magi,BLZ_240312,kendall,random,5,0,0.7999999999999999,0.08333333333333333
|
683 |
+
aggregate,holistic,magi,BLZ_240312,kendall,random,5,1,0.5270462766947299,0.206507295485425
|
684 |
+
aggregate,holistic,magi,BLZ_240312,kendall,random,5,2,0.6,0.23333333333333334
|
685 |
+
aggregate,holistic,magi,BLZ_240312,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
686 |
+
aggregate,holistic,magi,BLZ_240312,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
687 |
+
aggregate,holistic,magi,BLZ_240312,kendall,random,5,5,0.9999999999999999,0.016666666666666666
|
688 |
+
aggregate,holistic,magi,BLZ_240312,kendall,random,5,6,0.10540925533894596,0.8005421074231263
|
689 |
+
aggregate,holistic,magi,BLZ_240312,kendall,random,5,7,0.39999999999999997,0.48333333333333334
|
690 |
+
aggregate,holistic,magi,BLZ_240312,kendall,random,5,8,0.31622776601683794,0.44848886103153174
|
691 |
+
aggregate,holistic,magi,BLZ_240312,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
692 |
+
aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,0,0.39999999999999997,0.48333333333333334
|
693 |
+
aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
694 |
+
aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,2,0.19999999999999998,0.8166666666666667
|
695 |
+
aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
696 |
+
aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,4,0.39999999999999997,0.48333333333333334
|
697 |
+
aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,5,0.6,0.23333333333333334
|
698 |
+
aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,6,0.6,0.23333333333333334
|
699 |
+
aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,7,0.9999999999999999,0.016666666666666666
|
700 |
+
aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,8,0.6,0.23333333333333334
|
701 |
+
aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
702 |
+
aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,0,0.7999999999999999,0.08333333333333333
|
703 |
+
aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
704 |
+
aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
705 |
+
aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
706 |
+
aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
707 |
+
aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
708 |
+
aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,6,0.9999999999999999,0.016666666666666666
|
709 |
+
aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,7,0.9999999999999999,0.016666666666666666
|
710 |
+
aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
711 |
+
aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
712 |
+
aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,0,0.6,0.23333333333333334
|
713 |
+
aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,1,0.39999999999999997,0.48333333333333334
|
714 |
+
aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,2,0.0,1.0
|
715 |
+
aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,3,0.6,0.23333333333333334
|
716 |
+
aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,4,0.39999999999999997,0.48333333333333334
|
717 |
+
aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
718 |
+
aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,6,0.39999999999999997,0.48333333333333334
|
719 |
+
aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,7,0.9999999999999999,0.016666666666666666
|
720 |
+
aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,8,0.6,0.23333333333333334
|
721 |
+
aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,9,0.6,0.23333333333333334
|
cache/agreements_cache_9aca1000dd25da3a044f5fd80fad0266.csv
ADDED
@@ -0,0 +1,721 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value
|
2 |
+
hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.9999999999999999,0.016666666666666666
|
3 |
+
hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.39999999999999997,0.48333333333333334
|
4 |
+
hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
5 |
+
hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.39999999999999997,0.48333333333333334
|
6 |
+
hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
7 |
+
hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
|
8 |
+
hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.39999999999999997,0.48333333333333334
|
9 |
+
hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
10 |
+
hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
11 |
+
hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
12 |
+
humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.39999999999999997,0.48333333333333334
|
13 |
+
humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
14 |
+
humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
15 |
+
humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.39999999999999997,0.48333333333333334
|
16 |
+
humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
17 |
+
humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
|
18 |
+
humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.6,0.23333333333333334
|
19 |
+
humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.6,0.23333333333333334
|
20 |
+
humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.6,0.23333333333333334
|
21 |
+
humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.6,0.23333333333333334
|
22 |
+
mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
|
23 |
+
mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.39999999999999997,0.48333333333333334
|
24 |
+
mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
|
25 |
+
mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.10540925533894598,0.8005421074231263
|
26 |
+
mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.19999999999999998,0.8166666666666667
|
27 |
+
mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.39999999999999997,0.48333333333333334
|
28 |
+
mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.10540925533894598,0.8005421074231263
|
29 |
+
mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.10540925533894598,0.8005421074231263
|
30 |
+
mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,8,-0.5270462766947298,0.206507295485425
|
31 |
+
mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,9,-0.39999999999999997,0.48333333333333334
|
32 |
+
winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
|
33 |
+
winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
34 |
+
winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
35 |
+
winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
36 |
+
winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
37 |
+
winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
38 |
+
winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.39999999999999997,0.48333333333333334
|
39 |
+
winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.39999999999999997,0.48333333333333334
|
40 |
+
winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
41 |
+
winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
42 |
+
grounding,biggen_240612,aggregate,holistic,kendall,random,5,0,0.9486832980505137,0.02297740150320607
|
43 |
+
grounding,biggen_240612,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
44 |
+
grounding,biggen_240612,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
|
45 |
+
grounding,biggen_240612,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
46 |
+
grounding,biggen_240612,aggregate,holistic,kendall,random,5,4,0.6,0.23333333333333334
|
47 |
+
grounding,biggen_240612,aggregate,holistic,kendall,random,5,5,0.7378647873726218,0.07697417298126676
|
48 |
+
grounding,biggen_240612,aggregate,holistic,kendall,random,5,6,0.6,0.23333333333333334
|
49 |
+
grounding,biggen_240612,aggregate,holistic,kendall,random,5,7,0.9999999999999999,0.016666666666666666
|
50 |
+
grounding,biggen_240612,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
51 |
+
grounding,biggen_240612,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
52 |
+
instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,0,0.9486832980505137,0.02297740150320607
|
53 |
+
instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
54 |
+
instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,2,0.19999999999999998,0.8166666666666667
|
55 |
+
instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
|
56 |
+
instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,4,0.9486832980505137,0.02297740150320607
|
57 |
+
instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,5,0.7378647873726218,0.07697417298126676
|
58 |
+
instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,6,0.9999999999999999,0.016666666666666666
|
59 |
+
instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
60 |
+
instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
61 |
+
instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
62 |
+
planning,biggen_240612,aggregate,holistic,kendall,random,5,0,0.0,1.0
|
63 |
+
planning,biggen_240612,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
64 |
+
planning,biggen_240612,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
65 |
+
planning,biggen_240612,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
66 |
+
planning,biggen_240612,aggregate,holistic,kendall,random,5,4,0.6,0.23333333333333334
|
67 |
+
planning,biggen_240612,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
68 |
+
planning,biggen_240612,aggregate,holistic,kendall,random,5,6,0.6,0.23333333333333334
|
69 |
+
planning,biggen_240612,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
70 |
+
planning,biggen_240612,aggregate,holistic,kendall,random,5,8,0.39999999999999997,0.48333333333333334
|
71 |
+
planning,biggen_240612,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
72 |
+
reasoning,biggen_240612,aggregate,holistic,kendall,random,5,0,0.7999999999999999,0.08333333333333333
|
73 |
+
reasoning,biggen_240612,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
74 |
+
reasoning,biggen_240612,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
|
75 |
+
reasoning,biggen_240612,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
76 |
+
reasoning,biggen_240612,aggregate,holistic,kendall,random,5,4,0.39999999999999997,0.48333333333333334
|
77 |
+
reasoning,biggen_240612,aggregate,holistic,kendall,random,5,5,0.9999999999999999,0.016666666666666666
|
78 |
+
reasoning,biggen_240612,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
79 |
+
reasoning,biggen_240612,aggregate,holistic,kendall,random,5,7,0.6,0.23333333333333334
|
80 |
+
reasoning,biggen_240612,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
81 |
+
reasoning,biggen_240612,aggregate,holistic,kendall,random,5,9,0.6,0.23333333333333334
|
82 |
+
refinement,biggen_240612,aggregate,holistic,kendall,random,5,0,0.39999999999999997,0.48333333333333334
|
83 |
+
refinement,biggen_240612,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
84 |
+
refinement,biggen_240612,aggregate,holistic,kendall,random,5,2,0.39999999999999997,0.48333333333333334
|
85 |
+
refinement,biggen_240612,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
86 |
+
refinement,biggen_240612,aggregate,holistic,kendall,random,5,4,0.6,0.23333333333333334
|
87 |
+
refinement,biggen_240612,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
88 |
+
refinement,biggen_240612,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
89 |
+
refinement,biggen_240612,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
90 |
+
refinement,biggen_240612,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
91 |
+
refinement,biggen_240612,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
92 |
+
safety,biggen_240612,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
|
93 |
+
safety,biggen_240612,aggregate,holistic,kendall,random,5,1,0.0,1.0
|
94 |
+
safety,biggen_240612,aggregate,holistic,kendall,random,5,2,-0.39999999999999997,0.48333333333333334
|
95 |
+
safety,biggen_240612,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
|
96 |
+
safety,biggen_240612,aggregate,holistic,kendall,random,5,4,-0.6,0.23333333333333334
|
97 |
+
safety,biggen_240612,aggregate,holistic,kendall,random,5,5,0.39999999999999997,0.48333333333333334
|
98 |
+
safety,biggen_240612,aggregate,holistic,kendall,random,5,6,-0.19999999999999998,0.8166666666666667
|
99 |
+
safety,biggen_240612,aggregate,holistic,kendall,random,5,7,-0.39999999999999997,0.48333333333333334
|
100 |
+
safety,biggen_240612,aggregate,holistic,kendall,random,5,8,-0.19999999999999998,0.8166666666666667
|
101 |
+
safety,biggen_240612,aggregate,holistic,kendall,random,5,9,0.0,1.0
|
102 |
+
theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,0,0.7999999999999999,0.08333333333333333
|
103 |
+
theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,1,0.6,0.23333333333333334
|
104 |
+
theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,2,0.39999999999999997,0.48333333333333334
|
105 |
+
theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
106 |
+
theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,4,0.39999999999999997,0.48333333333333334
|
107 |
+
theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
108 |
+
theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,6,0.9999999999999999,0.016666666666666666
|
109 |
+
theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
110 |
+
theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
111 |
+
theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
112 |
+
tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,0,0.19999999999999998,0.8166666666666667
|
113 |
+
tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
114 |
+
tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
115 |
+
tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
|
116 |
+
tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
117 |
+
tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
|
118 |
+
tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
119 |
+
tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,7,0.6,0.23333333333333334
|
120 |
+
tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
121 |
+
tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
122 |
+
livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
|
123 |
+
livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
124 |
+
livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
|
125 |
+
livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,3,0.39999999999999997,0.48333333333333334
|
126 |
+
livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,4,0.6,0.23333333333333334
|
127 |
+
livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
128 |
+
livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,6,0.6,0.23333333333333334
|
129 |
+
livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
130 |
+
livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,8,0.6,0.23333333333333334
|
131 |
+
livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
132 |
+
reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,0,0.7378647873726218,0.07697417298126676
|
133 |
+
reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
134 |
+
reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,2,0.7378647873726218,0.07697417298126676
|
135 |
+
reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,3,0.5270462766947298,0.206507295485425
|
136 |
+
reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
137 |
+
reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
138 |
+
reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
139 |
+
reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
140 |
+
reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
141 |
+
reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
142 |
+
coding_average,livebench_240701,aggregate,holistic,kendall,random,5,0,0.5270462766947298,0.206507295485425
|
143 |
+
coding_average,livebench_240701,aggregate,holistic,kendall,random,5,1,0.7378647873726218,0.07697417298126676
|
144 |
+
coding_average,livebench_240701,aggregate,holistic,kendall,random,5,2,0.5270462766947298,0.206507295485425
|
145 |
+
coding_average,livebench_240701,aggregate,holistic,kendall,random,5,3,0.31622776601683794,0.44848886103153174
|
146 |
+
coding_average,livebench_240701,aggregate,holistic,kendall,random,5,4,0.6,0.23333333333333334
|
147 |
+
coding_average,livebench_240701,aggregate,holistic,kendall,random,5,5,0.7378647873726218,0.07697417298126676
|
148 |
+
coding_average,livebench_240701,aggregate,holistic,kendall,random,5,6,0.6,0.23333333333333334
|
149 |
+
coding_average,livebench_240701,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
150 |
+
coding_average,livebench_240701,aggregate,holistic,kendall,random,5,8,0.6,0.23333333333333334
|
151 |
+
coding_average,livebench_240701,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
152 |
+
mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
|
153 |
+
mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
154 |
+
mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
155 |
+
mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
|
156 |
+
mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,4,0.6,0.23333333333333334
|
157 |
+
mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,5,0.9999999999999999,0.016666666666666666
|
158 |
+
mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,6,0.6,0.23333333333333334
|
159 |
+
mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
160 |
+
mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,8,0.6,0.23333333333333334
|
161 |
+
mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
162 |
+
data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,0,0.7999999999999999,0.08333333333333333
|
163 |
+
data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
164 |
+
data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
165 |
+
data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
|
166 |
+
data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
167 |
+
data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
168 |
+
data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
169 |
+
data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
170 |
+
data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
171 |
+
data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
172 |
+
language_average,livebench_240701,aggregate,holistic,kendall,random,5,0,0.7999999999999999,0.08333333333333333
|
173 |
+
language_average,livebench_240701,aggregate,holistic,kendall,random,5,1,0.6,0.23333333333333334
|
174 |
+
language_average,livebench_240701,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
|
175 |
+
language_average,livebench_240701,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
|
176 |
+
language_average,livebench_240701,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
177 |
+
language_average,livebench_240701,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
|
178 |
+
language_average,livebench_240701,aggregate,holistic,kendall,random,5,6,0.6,0.23333333333333334
|
179 |
+
language_average,livebench_240701,aggregate,holistic,kendall,random,5,7,0.6,0.23333333333333334
|
180 |
+
language_average,livebench_240701,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
181 |
+
language_average,livebench_240701,aggregate,holistic,kendall,random,5,9,0.6,0.23333333333333334
|
182 |
+
if_average,livebench_240701,aggregate,holistic,kendall,random,5,0,-0.39999999999999997,0.48333333333333334
|
183 |
+
if_average,livebench_240701,aggregate,holistic,kendall,random,5,1,0.19999999999999998,0.8166666666666667
|
184 |
+
if_average,livebench_240701,aggregate,holistic,kendall,random,5,2,-0.19999999999999998,0.8166666666666667
|
185 |
+
if_average,livebench_240701,aggregate,holistic,kendall,random,5,3,-0.6,0.23333333333333334
|
186 |
+
if_average,livebench_240701,aggregate,holistic,kendall,random,5,4,0.19999999999999998,0.8166666666666667
|
187 |
+
if_average,livebench_240701,aggregate,holistic,kendall,random,5,5,0.19999999999999998,0.8166666666666667
|
188 |
+
if_average,livebench_240701,aggregate,holistic,kendall,random,5,6,0.0,1.0
|
189 |
+
if_average,livebench_240701,aggregate,holistic,kendall,random,5,7,0.39999999999999997,0.48333333333333334
|
190 |
+
if_average,livebench_240701,aggregate,holistic,kendall,random,5,8,0.19999999999999998,0.8166666666666667
|
191 |
+
if_average,livebench_240701,aggregate,holistic,kendall,random,5,9,0.6,0.23333333333333334
|
192 |
+
arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,0,0.39999999999999997,0.48333333333333334
|
193 |
+
arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
194 |
+
arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,2,0.9999999999999999,0.016666666666666666
|
195 |
+
arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
196 |
+
arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
197 |
+
arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,5,0.39999999999999997,0.48333333333333334
|
198 |
+
arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
199 |
+
arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,7,0.9999999999999999,0.016666666666666666
|
200 |
+
arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
201 |
+
arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,9,0.6,0.23333333333333334
|
202 |
+
mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,0,0.7999999999999999,0.08333333333333333
|
203 |
+
mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
204 |
+
mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
205 |
+
mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
206 |
+
mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
207 |
+
mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,5,0.9999999999999999,0.016666666666666666
|
208 |
+
mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,6,0.9999999999999999,0.016666666666666666
|
209 |
+
mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
210 |
+
mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
211 |
+
mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
212 |
+
agieval,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.9999999999999999,0.016666666666666666
|
213 |
+
agieval,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
214 |
+
agieval,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.9999999999999999,0.016666666666666666
|
215 |
+
agieval,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
216 |
+
agieval,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
217 |
+
agieval,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
218 |
+
agieval,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.9999999999999999,0.016666666666666666
|
219 |
+
agieval,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
220 |
+
agieval,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
221 |
+
agieval,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
222 |
+
arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.9486832980505137,0.02297740150320607
|
223 |
+
arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
224 |
+
arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.9999999999999999,0.016666666666666666
|
225 |
+
arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
226 |
+
arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
227 |
+
arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
|
228 |
+
arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
229 |
+
arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.9999999999999999,0.016666666666666666
|
230 |
+
arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
231 |
+
arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
232 |
+
alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.7999999999999999,0.08333333333333333
|
233 |
+
alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
234 |
+
alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.39999999999999997,0.48333333333333334
|
235 |
+
alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.9486832980505137,0.02297740150320607
|
236 |
+
alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
237 |
+
alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
|
238 |
+
alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
239 |
+
alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.39999999999999997,0.48333333333333334
|
240 |
+
alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
241 |
+
alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
242 |
+
alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.9999999999999999,0.016666666666666666
|
243 |
+
alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
244 |
+
alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
245 |
+
alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
246 |
+
alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.6,0.23333333333333334
|
247 |
+
alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
248 |
+
alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.19999999999999998,0.8166666666666667
|
249 |
+
alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
250 |
+
alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
251 |
+
alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
252 |
+
alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.39999999999999997,0.48333333333333334
|
253 |
+
alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.6,0.23333333333333334
|
254 |
+
alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.9999999999999999,0.016666666666666666
|
255 |
+
alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
|
256 |
+
alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
257 |
+
alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
258 |
+
alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.6,0.23333333333333334
|
259 |
+
alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
260 |
+
alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
261 |
+
alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
262 |
+
arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.9486832980505137,0.02297740150320607
|
263 |
+
arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.6,0.23333333333333334
|
264 |
+
arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
265 |
+
arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
266 |
+
arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
267 |
+
arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
|
268 |
+
arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
269 |
+
arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.6,0.23333333333333334
|
270 |
+
arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
271 |
+
arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
272 |
+
bbh,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
|
273 |
+
bbh,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
274 |
+
bbh,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
275 |
+
bbh,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
276 |
+
bbh,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
277 |
+
bbh,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
278 |
+
bbh,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
279 |
+
bbh,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.9999999999999999,0.016666666666666666
|
280 |
+
bbh,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
281 |
+
bbh,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
282 |
+
eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.7999999999999999,0.08333333333333333
|
283 |
+
eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
284 |
+
eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
285 |
+
eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
286 |
+
eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
287 |
+
eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
|
288 |
+
eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
289 |
+
eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
290 |
+
eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
291 |
+
eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
292 |
+
gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.7999999999999999,0.08333333333333333
|
293 |
+
gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.6,0.23333333333333334
|
294 |
+
gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.39999999999999997,0.48333333333333334
|
295 |
+
gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
296 |
+
gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
297 |
+
gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.9999999999999999,0.016666666666666666
|
298 |
+
gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
299 |
+
gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.6,0.23333333333333334
|
300 |
+
gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.39999999999999997,0.48333333333333334
|
301 |
+
gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
302 |
+
hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.7999999999999999,0.08333333333333333
|
303 |
+
hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
304 |
+
hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
305 |
+
hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
306 |
+
hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
307 |
+
hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
308 |
+
hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.6,0.23333333333333334
|
309 |
+
hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.19999999999999998,0.8166666666666667
|
310 |
+
hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.6,0.23333333333333334
|
311 |
+
hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.6,0.23333333333333334
|
312 |
+
llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.9999999999999999,0.016666666666666666
|
313 |
+
llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.9486832980505137,0.02297740150320607
|
314 |
+
llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.9999999999999999,0.016666666666666666
|
315 |
+
llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
316 |
+
llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.9486832980505137,0.02297740150320607
|
317 |
+
llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
318 |
+
llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.6,0.23333333333333334
|
319 |
+
llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
320 |
+
llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.7378647873726218,0.07697417298126676
|
321 |
+
llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
322 |
+
magi,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.9999999999999999,0.016666666666666666
|
323 |
+
magi,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.6,0.23333333333333334
|
324 |
+
magi,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
|
325 |
+
magi,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
|
326 |
+
magi,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
327 |
+
magi,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.9999999999999999,0.016666666666666666
|
328 |
+
magi,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
329 |
+
magi,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
330 |
+
magi,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.39999999999999997,0.48333333333333334
|
331 |
+
magi,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.6,0.23333333333333334
|
332 |
+
mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.7999999999999999,0.08333333333333333
|
333 |
+
mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
334 |
+
mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.9999999999999999,0.016666666666666666
|
335 |
+
mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
336 |
+
mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
337 |
+
mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
|
338 |
+
mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.9999999999999999,0.016666666666666666
|
339 |
+
mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.6,0.23333333333333334
|
340 |
+
mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
341 |
+
mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
342 |
+
mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.7999999999999999,0.08333333333333333
|
343 |
+
mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
344 |
+
mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
345 |
+
mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
346 |
+
mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
347 |
+
mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.39999999999999997,0.48333333333333334
|
348 |
+
mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.9999999999999999,0.016666666666666666
|
349 |
+
mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
350 |
+
mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
351 |
+
mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
352 |
+
biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
|
353 |
+
biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
354 |
+
biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
355 |
+
biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
|
356 |
+
biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
357 |
+
biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
|
358 |
+
biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
359 |
+
biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,7,0.9999999999999999,0.016666666666666666
|
360 |
+
biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
361 |
+
biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
362 |
+
aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,0,0.9999999999999999,0.016666666666666666
|
363 |
+
aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,1,0.39999999999999997,0.48333333333333334
|
364 |
+
aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
365 |
+
aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,3,0.39999999999999997,0.48333333333333334
|
366 |
+
aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
367 |
+
aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,5,0.6,0.23333333333333334
|
368 |
+
aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,6,0.39999999999999997,0.48333333333333334
|
369 |
+
aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
370 |
+
aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
371 |
+
aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
372 |
+
aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,0,0.39999999999999997,0.48333333333333334
|
373 |
+
aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
374 |
+
aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
375 |
+
aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,3,0.39999999999999997,0.48333333333333334
|
376 |
+
aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
377 |
+
aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,5,0.6,0.23333333333333334
|
378 |
+
aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,6,0.6,0.23333333333333334
|
379 |
+
aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,7,0.6,0.23333333333333334
|
380 |
+
aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,8,0.6,0.23333333333333334
|
381 |
+
aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,9,0.6,0.23333333333333334
|
382 |
+
aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,0,0.6,0.23333333333333334
|
383 |
+
aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,1,0.39999999999999997,0.48333333333333334
|
384 |
+
aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,2,0.6,0.23333333333333334
|
385 |
+
aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,3,0.10540925533894598,0.8005421074231263
|
386 |
+
aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,4,0.19999999999999998,0.8166666666666667
|
387 |
+
aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,5,0.39999999999999997,0.48333333333333334
|
388 |
+
aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,6,0.10540925533894598,0.8005421074231263
|
389 |
+
aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,7,0.10540925533894598,0.8005421074231263
|
390 |
+
aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,8,-0.5270462766947298,0.206507295485425
|
391 |
+
aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,9,-0.39999999999999997,0.48333333333333334
|
392 |
+
aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,0,0.6,0.23333333333333334
|
393 |
+
aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
394 |
+
aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
395 |
+
aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
396 |
+
aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
397 |
+
aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
398 |
+
aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,6,0.39999999999999997,0.48333333333333334
|
399 |
+
aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,7,0.39999999999999997,0.48333333333333334
|
400 |
+
aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
401 |
+
aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
402 |
+
aggregate,holistic,grounding,biggen_240612,kendall,random,5,0,0.9486832980505137,0.02297740150320607
|
403 |
+
aggregate,holistic,grounding,biggen_240612,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
404 |
+
aggregate,holistic,grounding,biggen_240612,kendall,random,5,2,0.6,0.23333333333333334
|
405 |
+
aggregate,holistic,grounding,biggen_240612,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
406 |
+
aggregate,holistic,grounding,biggen_240612,kendall,random,5,4,0.6,0.23333333333333334
|
407 |
+
aggregate,holistic,grounding,biggen_240612,kendall,random,5,5,0.7378647873726218,0.07697417298126676
|
408 |
+
aggregate,holistic,grounding,biggen_240612,kendall,random,5,6,0.6,0.23333333333333334
|
409 |
+
aggregate,holistic,grounding,biggen_240612,kendall,random,5,7,0.9999999999999999,0.016666666666666666
|
410 |
+
aggregate,holistic,grounding,biggen_240612,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
411 |
+
aggregate,holistic,grounding,biggen_240612,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
412 |
+
aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,0,0.9486832980505137,0.02297740150320607
|
413 |
+
aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
414 |
+
aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,2,0.19999999999999998,0.8166666666666667
|
415 |
+
aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,3,0.6,0.23333333333333334
|
416 |
+
aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,4,0.9486832980505137,0.02297740150320607
|
417 |
+
aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,5,0.7378647873726218,0.07697417298126676
|
418 |
+
aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,6,0.9999999999999999,0.016666666666666666
|
419 |
+
aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
420 |
+
aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
421 |
+
aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
422 |
+
aggregate,holistic,planning,biggen_240612,kendall,random,5,0,0.0,1.0
|
423 |
+
aggregate,holistic,planning,biggen_240612,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
424 |
+
aggregate,holistic,planning,biggen_240612,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
425 |
+
aggregate,holistic,planning,biggen_240612,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
426 |
+
aggregate,holistic,planning,biggen_240612,kendall,random,5,4,0.6,0.23333333333333334
|
427 |
+
aggregate,holistic,planning,biggen_240612,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
428 |
+
aggregate,holistic,planning,biggen_240612,kendall,random,5,6,0.6,0.23333333333333334
|
429 |
+
aggregate,holistic,planning,biggen_240612,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
430 |
+
aggregate,holistic,planning,biggen_240612,kendall,random,5,8,0.39999999999999997,0.48333333333333334
|
431 |
+
aggregate,holistic,planning,biggen_240612,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
432 |
+
aggregate,holistic,reasoning,biggen_240612,kendall,random,5,0,0.7999999999999999,0.08333333333333333
|
433 |
+
aggregate,holistic,reasoning,biggen_240612,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
434 |
+
aggregate,holistic,reasoning,biggen_240612,kendall,random,5,2,0.6,0.23333333333333334
|
435 |
+
aggregate,holistic,reasoning,biggen_240612,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
436 |
+
aggregate,holistic,reasoning,biggen_240612,kendall,random,5,4,0.39999999999999997,0.48333333333333334
|
437 |
+
aggregate,holistic,reasoning,biggen_240612,kendall,random,5,5,0.9999999999999999,0.016666666666666666
|
438 |
+
aggregate,holistic,reasoning,biggen_240612,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
439 |
+
aggregate,holistic,reasoning,biggen_240612,kendall,random,5,7,0.6,0.23333333333333334
|
440 |
+
aggregate,holistic,reasoning,biggen_240612,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
441 |
+
aggregate,holistic,reasoning,biggen_240612,kendall,random,5,9,0.6,0.23333333333333334
|
442 |
+
aggregate,holistic,refinement,biggen_240612,kendall,random,5,0,0.39999999999999997,0.48333333333333334
|
443 |
+
aggregate,holistic,refinement,biggen_240612,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
444 |
+
aggregate,holistic,refinement,biggen_240612,kendall,random,5,2,0.39999999999999997,0.48333333333333334
|
445 |
+
aggregate,holistic,refinement,biggen_240612,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
446 |
+
aggregate,holistic,refinement,biggen_240612,kendall,random,5,4,0.6,0.23333333333333334
|
447 |
+
aggregate,holistic,refinement,biggen_240612,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
448 |
+
aggregate,holistic,refinement,biggen_240612,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
449 |
+
aggregate,holistic,refinement,biggen_240612,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
450 |
+
aggregate,holistic,refinement,biggen_240612,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
451 |
+
aggregate,holistic,refinement,biggen_240612,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
452 |
+
aggregate,holistic,safety,biggen_240612,kendall,random,5,0,0.6,0.23333333333333334
|
453 |
+
aggregate,holistic,safety,biggen_240612,kendall,random,5,1,0.0,1.0
|
454 |
+
aggregate,holistic,safety,biggen_240612,kendall,random,5,2,-0.39999999999999997,0.48333333333333334
|
455 |
+
aggregate,holistic,safety,biggen_240612,kendall,random,5,3,0.6,0.23333333333333334
|
456 |
+
aggregate,holistic,safety,biggen_240612,kendall,random,5,4,-0.6,0.23333333333333334
|
457 |
+
aggregate,holistic,safety,biggen_240612,kendall,random,5,5,0.39999999999999997,0.48333333333333334
|
458 |
+
aggregate,holistic,safety,biggen_240612,kendall,random,5,6,-0.19999999999999998,0.8166666666666667
|
459 |
+
aggregate,holistic,safety,biggen_240612,kendall,random,5,7,-0.39999999999999997,0.48333333333333334
|
460 |
+
aggregate,holistic,safety,biggen_240612,kendall,random,5,8,-0.19999999999999998,0.8166666666666667
|
461 |
+
aggregate,holistic,safety,biggen_240612,kendall,random,5,9,0.0,1.0
|
462 |
+
aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,0,0.7999999999999999,0.08333333333333333
|
463 |
+
aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,1,0.6,0.23333333333333334
|
464 |
+
aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,2,0.39999999999999997,0.48333333333333334
|
465 |
+
aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
466 |
+
aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,4,0.39999999999999997,0.48333333333333334
|
467 |
+
aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
468 |
+
aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,6,0.9999999999999999,0.016666666666666666
|
469 |
+
aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
470 |
+
aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
471 |
+
aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
472 |
+
aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,0,0.19999999999999998,0.8166666666666667
|
473 |
+
aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
474 |
+
aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
475 |
+
aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,3,0.6,0.23333333333333334
|
476 |
+
aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
477 |
+
aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,5,0.6,0.23333333333333334
|
478 |
+
aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
479 |
+
aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,7,0.6,0.23333333333333334
|
480 |
+
aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
481 |
+
aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
482 |
+
aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,0,0.6,0.23333333333333334
|
483 |
+
aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
484 |
+
aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,2,0.6,0.23333333333333334
|
485 |
+
aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,3,0.39999999999999997,0.48333333333333334
|
486 |
+
aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,4,0.6,0.23333333333333334
|
487 |
+
aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
488 |
+
aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,6,0.6,0.23333333333333334
|
489 |
+
aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
490 |
+
aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,8,0.6,0.23333333333333334
|
491 |
+
aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
492 |
+
aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,0,0.7378647873726218,0.07697417298126676
|
493 |
+
aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
494 |
+
aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,2,0.7378647873726218,0.07697417298126676
|
495 |
+
aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,3,0.5270462766947298,0.206507295485425
|
496 |
+
aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
497 |
+
aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
498 |
+
aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
499 |
+
aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
500 |
+
aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
501 |
+
aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
502 |
+
aggregate,holistic,coding_average,livebench_240701,kendall,random,5,0,0.5270462766947298,0.206507295485425
|
503 |
+
aggregate,holistic,coding_average,livebench_240701,kendall,random,5,1,0.7378647873726218,0.07697417298126676
|
504 |
+
aggregate,holistic,coding_average,livebench_240701,kendall,random,5,2,0.5270462766947298,0.206507295485425
|
505 |
+
aggregate,holistic,coding_average,livebench_240701,kendall,random,5,3,0.31622776601683794,0.44848886103153174
|
506 |
+
aggregate,holistic,coding_average,livebench_240701,kendall,random,5,4,0.6,0.23333333333333334
|
507 |
+
aggregate,holistic,coding_average,livebench_240701,kendall,random,5,5,0.7378647873726218,0.07697417298126676
|
508 |
+
aggregate,holistic,coding_average,livebench_240701,kendall,random,5,6,0.6,0.23333333333333334
|
509 |
+
aggregate,holistic,coding_average,livebench_240701,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
510 |
+
aggregate,holistic,coding_average,livebench_240701,kendall,random,5,8,0.6,0.23333333333333334
|
511 |
+
aggregate,holistic,coding_average,livebench_240701,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
512 |
+
aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,0,0.6,0.23333333333333334
|
513 |
+
aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
514 |
+
aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
515 |
+
aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,3,0.6,0.23333333333333334
|
516 |
+
aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,4,0.6,0.23333333333333334
|
517 |
+
aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,5,0.9999999999999999,0.016666666666666666
|
518 |
+
aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,6,0.6,0.23333333333333334
|
519 |
+
aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
520 |
+
aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,8,0.6,0.23333333333333334
|
521 |
+
aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
522 |
+
aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,0,0.7999999999999999,0.08333333333333333
|
523 |
+
aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
524 |
+
aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
525 |
+
aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,3,0.6,0.23333333333333334
|
526 |
+
aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
527 |
+
aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
528 |
+
aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
529 |
+
aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
530 |
+
aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
531 |
+
aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
532 |
+
aggregate,holistic,language_average,livebench_240701,kendall,random,5,0,0.7999999999999999,0.08333333333333333
|
533 |
+
aggregate,holistic,language_average,livebench_240701,kendall,random,5,1,0.6,0.23333333333333334
|
534 |
+
aggregate,holistic,language_average,livebench_240701,kendall,random,5,2,0.6,0.23333333333333334
|
535 |
+
aggregate,holistic,language_average,livebench_240701,kendall,random,5,3,0.6,0.23333333333333334
|
536 |
+
aggregate,holistic,language_average,livebench_240701,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
537 |
+
aggregate,holistic,language_average,livebench_240701,kendall,random,5,5,0.6,0.23333333333333334
|
538 |
+
aggregate,holistic,language_average,livebench_240701,kendall,random,5,6,0.6,0.23333333333333334
|
539 |
+
aggregate,holistic,language_average,livebench_240701,kendall,random,5,7,0.6,0.23333333333333334
|
540 |
+
aggregate,holistic,language_average,livebench_240701,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
541 |
+
aggregate,holistic,language_average,livebench_240701,kendall,random,5,9,0.6,0.23333333333333334
|
542 |
+
aggregate,holistic,if_average,livebench_240701,kendall,random,5,0,-0.39999999999999997,0.48333333333333334
|
543 |
+
aggregate,holistic,if_average,livebench_240701,kendall,random,5,1,0.19999999999999998,0.8166666666666667
|
544 |
+
aggregate,holistic,if_average,livebench_240701,kendall,random,5,2,-0.19999999999999998,0.8166666666666667
|
545 |
+
aggregate,holistic,if_average,livebench_240701,kendall,random,5,3,-0.6,0.23333333333333334
|
546 |
+
aggregate,holistic,if_average,livebench_240701,kendall,random,5,4,0.19999999999999998,0.8166666666666667
|
547 |
+
aggregate,holistic,if_average,livebench_240701,kendall,random,5,5,0.19999999999999998,0.8166666666666667
|
548 |
+
aggregate,holistic,if_average,livebench_240701,kendall,random,5,6,0.0,1.0
|
549 |
+
aggregate,holistic,if_average,livebench_240701,kendall,random,5,7,0.39999999999999997,0.48333333333333334
|
550 |
+
aggregate,holistic,if_average,livebench_240701,kendall,random,5,8,0.19999999999999998,0.8166666666666667
|
551 |
+
aggregate,holistic,if_average,livebench_240701,kendall,random,5,9,0.6,0.23333333333333334
|
552 |
+
aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,0,0.39999999999999997,0.48333333333333334
|
553 |
+
aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
554 |
+
aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,2,0.9999999999999999,0.016666666666666666
|
555 |
+
aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
556 |
+
aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
557 |
+
aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,5,0.39999999999999997,0.48333333333333334
|
558 |
+
aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
559 |
+
aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,7,0.9999999999999999,0.016666666666666666
|
560 |
+
aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
561 |
+
aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,9,0.6,0.23333333333333334
|
562 |
+
aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,0,0.7999999999999999,0.08333333333333333
|
563 |
+
aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
564 |
+
aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
565 |
+
aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
566 |
+
aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
567 |
+
aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,5,0.9999999999999999,0.016666666666666666
|
568 |
+
aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,6,0.9999999999999999,0.016666666666666666
|
569 |
+
aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
570 |
+
aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
571 |
+
aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
572 |
+
aggregate,holistic,agieval,BLZ_240312,kendall,random,5,0,0.9999999999999999,0.016666666666666666
|
573 |
+
aggregate,holistic,agieval,BLZ_240312,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
574 |
+
aggregate,holistic,agieval,BLZ_240312,kendall,random,5,2,0.9999999999999999,0.016666666666666666
|
575 |
+
aggregate,holistic,agieval,BLZ_240312,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
576 |
+
aggregate,holistic,agieval,BLZ_240312,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
577 |
+
aggregate,holistic,agieval,BLZ_240312,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
578 |
+
aggregate,holistic,agieval,BLZ_240312,kendall,random,5,6,0.9999999999999999,0.016666666666666666
|
579 |
+
aggregate,holistic,agieval,BLZ_240312,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
580 |
+
aggregate,holistic,agieval,BLZ_240312,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
581 |
+
aggregate,holistic,agieval,BLZ_240312,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
582 |
+
aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,0,0.9486832980505137,0.02297740150320607
|
583 |
+
aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
584 |
+
aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,2,0.9999999999999999,0.016666666666666666
|
585 |
+
aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
586 |
+
aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
587 |
+
aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,5,0.6,0.23333333333333334
|
588 |
+
aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
589 |
+
aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,7,0.9999999999999999,0.016666666666666666
|
590 |
+
aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
591 |
+
aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
592 |
+
aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,0,0.7999999999999999,0.08333333333333333
|
593 |
+
aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
594 |
+
aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,2,0.39999999999999997,0.48333333333333334
|
595 |
+
aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,3,0.9486832980505137,0.02297740150320607
|
596 |
+
aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
597 |
+
aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,5,0.6,0.23333333333333334
|
598 |
+
aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
599 |
+
aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,7,0.39999999999999997,0.48333333333333334
|
600 |
+
aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
601 |
+
aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
602 |
+
aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,0,0.9999999999999999,0.016666666666666666
|
603 |
+
aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
604 |
+
aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
605 |
+
aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
606 |
+
aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,4,0.6,0.23333333333333334
|
607 |
+
aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
608 |
+
aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,6,0.19999999999999998,0.8166666666666667
|
609 |
+
aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
610 |
+
aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
611 |
+
aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
612 |
+
aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,0,0.39999999999999997,0.48333333333333334
|
613 |
+
aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,1,0.6,0.23333333333333334
|
614 |
+
aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,2,0.9999999999999999,0.016666666666666666
|
615 |
+
aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,3,0.6,0.23333333333333334
|
616 |
+
aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
617 |
+
aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
618 |
+
aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,6,0.6,0.23333333333333334
|
619 |
+
aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
620 |
+
aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
621 |
+
aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
622 |
+
aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,0,0.9486832980505137,0.02297740150320607
|
623 |
+
aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,1,0.6,0.23333333333333334
|
624 |
+
aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
625 |
+
aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
626 |
+
aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
627 |
+
aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,5,0.6,0.23333333333333334
|
628 |
+
aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
629 |
+
aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,7,0.6,0.23333333333333334
|
630 |
+
aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
631 |
+
aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
632 |
+
aggregate,holistic,bbh,BLZ_240312,kendall,random,5,0,0.6,0.23333333333333334
|
633 |
+
aggregate,holistic,bbh,BLZ_240312,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
634 |
+
aggregate,holistic,bbh,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
635 |
+
aggregate,holistic,bbh,BLZ_240312,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
636 |
+
aggregate,holistic,bbh,BLZ_240312,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
637 |
+
aggregate,holistic,bbh,BLZ_240312,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
638 |
+
aggregate,holistic,bbh,BLZ_240312,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
639 |
+
aggregate,holistic,bbh,BLZ_240312,kendall,random,5,7,0.9999999999999999,0.016666666666666666
|
640 |
+
aggregate,holistic,bbh,BLZ_240312,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
641 |
+
aggregate,holistic,bbh,BLZ_240312,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
642 |
+
aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,0,0.7999999999999999,0.08333333333333333
|
643 |
+
aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
644 |
+
aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
645 |
+
aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
646 |
+
aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
647 |
+
aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,5,0.6,0.23333333333333334
|
648 |
+
aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
649 |
+
aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
650 |
+
aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
651 |
+
aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
652 |
+
aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,0,0.7999999999999999,0.08333333333333333
|
653 |
+
aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,1,0.6,0.23333333333333334
|
654 |
+
aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,2,0.39999999999999997,0.48333333333333334
|
655 |
+
aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
656 |
+
aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
657 |
+
aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,5,0.9999999999999999,0.016666666666666666
|
658 |
+
aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
659 |
+
aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,7,0.6,0.23333333333333334
|
660 |
+
aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,8,0.39999999999999997,0.48333333333333334
|
661 |
+
aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
662 |
+
aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,0,0.7999999999999999,0.08333333333333333
|
663 |
+
aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
664 |
+
aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
665 |
+
aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
666 |
+
aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
667 |
+
aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
668 |
+
aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,6,0.6,0.23333333333333334
|
669 |
+
aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,7,0.19999999999999998,0.8166666666666667
|
670 |
+
aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,8,0.6,0.23333333333333334
|
671 |
+
aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,9,0.6,0.23333333333333334
|
672 |
+
aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,0,0.9999999999999999,0.016666666666666666
|
673 |
+
aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,1,0.9486832980505137,0.02297740150320607
|
674 |
+
aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,2,0.9999999999999999,0.016666666666666666
|
675 |
+
aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
676 |
+
aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,4,0.9486832980505137,0.02297740150320607
|
677 |
+
aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,5,0.7999999999999999,0.08333333333333333
|
678 |
+
aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,6,0.6,0.23333333333333334
|
679 |
+
aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
680 |
+
aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,8,0.7378647873726218,0.07697417298126676
|
681 |
+
aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
682 |
+
aggregate,holistic,magi,BLZ_240312,kendall,random,5,0,0.9999999999999999,0.016666666666666666
|
683 |
+
aggregate,holistic,magi,BLZ_240312,kendall,random,5,1,0.6,0.23333333333333334
|
684 |
+
aggregate,holistic,magi,BLZ_240312,kendall,random,5,2,0.6,0.23333333333333334
|
685 |
+
aggregate,holistic,magi,BLZ_240312,kendall,random,5,3,0.6,0.23333333333333334
|
686 |
+
aggregate,holistic,magi,BLZ_240312,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
687 |
+
aggregate,holistic,magi,BLZ_240312,kendall,random,5,5,0.9999999999999999,0.016666666666666666
|
688 |
+
aggregate,holistic,magi,BLZ_240312,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
689 |
+
aggregate,holistic,magi,BLZ_240312,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
690 |
+
aggregate,holistic,magi,BLZ_240312,kendall,random,5,8,0.39999999999999997,0.48333333333333334
|
691 |
+
aggregate,holistic,magi,BLZ_240312,kendall,random,5,9,0.6,0.23333333333333334
|
692 |
+
aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,0,0.7999999999999999,0.08333333333333333
|
693 |
+
aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
694 |
+
aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,2,0.9999999999999999,0.016666666666666666
|
695 |
+
aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,3,0.7999999999999999,0.08333333333333333
|
696 |
+
aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,4,0.7999999999999999,0.08333333333333333
|
697 |
+
aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,5,0.6,0.23333333333333334
|
698 |
+
aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,6,0.9999999999999999,0.016666666666666666
|
699 |
+
aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,7,0.6,0.23333333333333334
|
700 |
+
aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,8,0.9999999999999999,0.016666666666666666
|
701 |
+
aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
702 |
+
aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,0,0.7999999999999999,0.08333333333333333
|
703 |
+
aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,1,0.7999999999999999,0.08333333333333333
|
704 |
+
aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
705 |
+
aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,3,0.9999999999999999,0.016666666666666666
|
706 |
+
aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
707 |
+
aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,5,0.39999999999999997,0.48333333333333334
|
708 |
+
aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,6,0.9999999999999999,0.016666666666666666
|
709 |
+
aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,7,0.7999999999999999,0.08333333333333333
|
710 |
+
aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
711 |
+
aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,9,0.9999999999999999,0.016666666666666666
|
712 |
+
aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,0,0.6,0.23333333333333334
|
713 |
+
aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,1,0.9999999999999999,0.016666666666666666
|
714 |
+
aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,2,0.7999999999999999,0.08333333333333333
|
715 |
+
aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,3,0.6,0.23333333333333334
|
716 |
+
aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,4,0.9999999999999999,0.016666666666666666
|
717 |
+
aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,5,0.6,0.23333333333333334
|
718 |
+
aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,6,0.7999999999999999,0.08333333333333333
|
719 |
+
aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,7,0.9999999999999999,0.016666666666666666
|
720 |
+
aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,8,0.7999999999999999,0.08333333333333333
|
721 |
+
aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,9,0.7999999999999999,0.08333333333333333
|
cache/agreements_cache_a8b645e4d5ba862fbfa9ef3ecf73b44c.csv
ADDED
@@ -0,0 +1,721 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value
|
2 |
+
hellaswag,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.0,1.0
|
3 |
+
hellaswag,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
|
4 |
+
hellaswag,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,-0.19999999999999998,0.8166666666666667
|
5 |
+
hellaswag,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.6,0.23333333333333334
|
6 |
+
hellaswag,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.39999999999999997,0.48333333333333334
|
7 |
+
hellaswag,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.19999999999999998,0.8166666666666667
|
8 |
+
hellaswag,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
|
9 |
+
hellaswag,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
|
10 |
+
hellaswag,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.39999999999999997,0.48333333333333334
|
11 |
+
hellaswag,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.6,0.23333333333333334
|
12 |
+
humaneval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,-0.39999999999999997,0.48333333333333334
|
13 |
+
humaneval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,-0.5270462766947298,0.206507295485425
|
14 |
+
humaneval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.5270462766947298,0.206507295485425
|
15 |
+
humaneval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.5270462766947298,0.206507295485425
|
16 |
+
humaneval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,-0.22360679774997896,0.6015081344405899
|
17 |
+
humaneval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.10540925533894598,0.8005421074231263
|
18 |
+
humaneval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,-0.10540925533894598,0.8005421074231263
|
19 |
+
humaneval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.39999999999999997,0.48333333333333334
|
20 |
+
humaneval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,-0.10540925533894598,0.8005421074231263
|
21 |
+
humaneval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,-0.22360679774997896,0.6015081344405899
|
22 |
+
mbpp,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.10540925533894598,0.8005421074231263
|
23 |
+
mbpp,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,-0.31622776601683794,0.44848886103153174
|
24 |
+
mbpp,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,-0.31622776601683794,0.44848886103153174
|
25 |
+
mbpp,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.10540925533894598,0.8005421074231263
|
26 |
+
mbpp,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,-0.39999999999999997,0.48333333333333334
|
27 |
+
mbpp,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.10540925533894598,0.8005421074231263
|
28 |
+
mbpp,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
|
29 |
+
mbpp,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.39999999999999997,0.48333333333333334
|
30 |
+
mbpp,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.10540925533894598,0.8005421074231263
|
31 |
+
mbpp,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.39999999999999997,0.48333333333333334
|
32 |
+
winogrande,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,-0.19999999999999998,0.8166666666666667
|
33 |
+
winogrande,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.7999999999999999,0.08333333333333333
|
34 |
+
winogrande,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.0,1.0
|
35 |
+
winogrande,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,-0.19999999999999998,0.8166666666666667
|
36 |
+
winogrande,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.31622776601683794,0.44848886103153174
|
37 |
+
winogrande,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.7999999999999999,0.08333333333333333
|
38 |
+
winogrande,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,-0.9999999999999999,0.016666666666666666
|
39 |
+
winogrande,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.7999999999999999,0.08333333333333333
|
40 |
+
winogrande,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.0,1.0
|
41 |
+
winogrande,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,-0.19999999999999998,0.8166666666666667
|
42 |
+
grounding,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.9486832980505137,0.02297740150320607
|
43 |
+
grounding,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.19999999999999998,0.8166666666666667
|
44 |
+
grounding,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.10540925533894598,0.8005421074231263
|
45 |
+
grounding,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.31622776601683794,0.44848886103153174
|
46 |
+
grounding,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.9486832980505137,0.02297740150320607
|
47 |
+
grounding,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.9486832980505137,0.02297740150320607
|
48 |
+
grounding,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.31622776601683794,0.44848886103153174
|
49 |
+
grounding,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.19999999999999998,0.8166666666666667
|
50 |
+
grounding,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.39999999999999997,0.48333333333333334
|
51 |
+
grounding,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.19999999999999998,0.8166666666666667
|
52 |
+
instruction_following,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.0,1.0
|
53 |
+
instruction_following,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
|
54 |
+
instruction_following,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.7378647873726218,0.07697417298126676
|
55 |
+
instruction_following,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,3,-0.39999999999999997,0.48333333333333334
|
56 |
+
instruction_following,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.8944271909999159,0.0367138563627041
|
57 |
+
instruction_following,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.9486832980505137,0.02297740150320607
|
58 |
+
instruction_following,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.9486832980505137,0.02297740150320607
|
59 |
+
instruction_following,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.0,1.0
|
60 |
+
instruction_following,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.8944271909999159,0.0367138563627041
|
61 |
+
instruction_following,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.7378647873726218,0.07697417298126676
|
62 |
+
planning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.7378647873726218,0.07697417298126676
|
63 |
+
planning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.7999999999999999,0.08333333333333333
|
64 |
+
planning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.39999999999999997,0.48333333333333334
|
65 |
+
planning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.39999999999999997,0.48333333333333334
|
66 |
+
planning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.19999999999999998,0.8166666666666667
|
67 |
+
planning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.39999999999999997,0.48333333333333334
|
68 |
+
planning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
|
69 |
+
planning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.7378647873726218,0.07697417298126676
|
70 |
+
planning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,8,-0.10540925533894598,0.8005421074231263
|
71 |
+
planning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.7378647873726218,0.07697417298126676
|
72 |
+
reasoning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.7378647873726218,0.07697417298126676
|
73 |
+
reasoning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.6,0.23333333333333334
|
74 |
+
reasoning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.7378647873726218,0.07697417298126676
|
75 |
+
reasoning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.6,0.23333333333333334
|
76 |
+
reasoning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.5270462766947298,0.206507295485425
|
77 |
+
reasoning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.7999999999999999,0.08333333333333333
|
78 |
+
reasoning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.5270462766947298,0.206507295485425
|
79 |
+
reasoning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
|
80 |
+
reasoning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.0,1.0
|
81 |
+
reasoning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.39999999999999997,0.48333333333333334
|
82 |
+
refinement,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.6,0.23333333333333334
|
83 |
+
refinement,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
|
84 |
+
refinement,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.19999999999999998,0.8166666666666667
|
85 |
+
refinement,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.39999999999999997,0.48333333333333334
|
86 |
+
refinement,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.39999999999999997,0.48333333333333334
|
87 |
+
refinement,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.39999999999999997,0.48333333333333334
|
88 |
+
refinement,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
|
89 |
+
refinement,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.39999999999999997,0.48333333333333334
|
90 |
+
refinement,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.39999999999999997,0.48333333333333334
|
91 |
+
refinement,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.6,0.23333333333333334
|
92 |
+
safety,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.0,1.0
|
93 |
+
safety,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.0,1.0
|
94 |
+
safety,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,2,-0.19999999999999998,0.8166666666666667
|
95 |
+
safety,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.0,1.0
|
96 |
+
safety,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,4,-0.19999999999999998,0.8166666666666667
|
97 |
+
safety,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,5,-0.6,0.23333333333333334
|
98 |
+
safety,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.7999999999999999,0.08333333333333333
|
99 |
+
safety,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,7,-0.6,0.23333333333333334
|
100 |
+
safety,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.6,0.23333333333333334
|
101 |
+
safety,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.6,0.23333333333333334
|
102 |
+
theory_of_mind,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.19999999999999998,0.8166666666666667
|
103 |
+
theory_of_mind,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
|
104 |
+
theory_of_mind,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.6,0.23333333333333334
|
105 |
+
theory_of_mind,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.0,1.0
|
106 |
+
theory_of_mind,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.19999999999999998,0.8166666666666667
|
107 |
+
theory_of_mind,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.19999999999999998,0.8166666666666667
|
108 |
+
theory_of_mind,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
|
109 |
+
theory_of_mind,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.0,1.0
|
110 |
+
theory_of_mind,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.19999999999999998,0.8166666666666667
|
111 |
+
theory_of_mind,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.39999999999999997,0.48333333333333334
|
112 |
+
tool_usage,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.19999999999999998,0.8166666666666667
|
113 |
+
tool_usage,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
|
114 |
+
tool_usage,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.9999999999999999,0.016666666666666666
|
115 |
+
tool_usage,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.9999999999999999,0.016666666666666666
|
116 |
+
tool_usage,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.39999999999999997,0.48333333333333334
|
117 |
+
tool_usage,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.9999999999999999,0.016666666666666666
|
118 |
+
tool_usage,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.19999999999999998,0.8166666666666667
|
119 |
+
tool_usage,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
|
120 |
+
tool_usage,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.7999999999999999,0.08333333333333333
|
121 |
+
tool_usage,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.39999999999999997,0.48333333333333334
|
122 |
+
livebench_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.6,0.23333333333333334
|
123 |
+
livebench_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.7999999999999999,0.08333333333333333
|
124 |
+
livebench_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.7999999999999999,0.08333333333333333
|
125 |
+
livebench_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.6,0.23333333333333334
|
126 |
+
livebench_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.6,0.23333333333333334
|
127 |
+
livebench_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.39999999999999997,0.48333333333333334
|
128 |
+
livebench_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.6,0.23333333333333334
|
129 |
+
livebench_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.7999999999999999,0.08333333333333333
|
130 |
+
livebench_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.39999999999999997,0.48333333333333334
|
131 |
+
livebench_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.39999999999999997,0.48333333333333334
|
132 |
+
reasoning_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.7378647873726218,0.07697417298126676
|
133 |
+
reasoning_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.5270462766947298,0.206507295485425
|
134 |
+
reasoning_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.7378647873726218,0.07697417298126676
|
135 |
+
reasoning_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.7378647873726218,0.07697417298126676
|
136 |
+
reasoning_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.5270462766947298,0.206507295485425
|
137 |
+
reasoning_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.7999999999999999,0.08333333333333333
|
138 |
+
reasoning_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.7378647873726218,0.07697417298126676
|
139 |
+
reasoning_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.7999999999999999,0.08333333333333333
|
140 |
+
reasoning_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.5270462766947298,0.206507295485425
|
141 |
+
reasoning_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.7378647873726218,0.07697417298126676
|
142 |
+
coding_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.31622776601683794,0.44848886103153174
|
143 |
+
coding_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.31622776601683794,0.44848886103153174
|
144 |
+
coding_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.6,0.23333333333333334
|
145 |
+
coding_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.31622776601683794,0.44848886103153174
|
146 |
+
coding_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.31622776601683794,0.44848886103153174
|
147 |
+
coding_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.5270462766947298,0.206507295485425
|
148 |
+
coding_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.5270462766947298,0.206507295485425
|
149 |
+
coding_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.5270462766947298,0.206507295485425
|
150 |
+
coding_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.5270462766947298,0.206507295485425
|
151 |
+
coding_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.5270462766947298,0.206507295485425
|
152 |
+
mathematics_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.6,0.23333333333333334
|
153 |
+
mathematics_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.6,0.23333333333333334
|
154 |
+
mathematics_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.6,0.23333333333333334
|
155 |
+
mathematics_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.6,0.23333333333333334
|
156 |
+
mathematics_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.6,0.23333333333333334
|
157 |
+
mathematics_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.6,0.23333333333333334
|
158 |
+
mathematics_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.6,0.23333333333333334
|
159 |
+
mathematics_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
|
160 |
+
mathematics_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.6,0.23333333333333334
|
161 |
+
mathematics_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.6,0.23333333333333334
|
162 |
+
data_analysis_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.6,0.23333333333333334
|
163 |
+
data_analysis_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.6,0.23333333333333334
|
164 |
+
data_analysis_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.6,0.23333333333333334
|
165 |
+
data_analysis_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.6,0.23333333333333334
|
166 |
+
data_analysis_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.7999999999999999,0.08333333333333333
|
167 |
+
data_analysis_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.7999999999999999,0.08333333333333333
|
168 |
+
data_analysis_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.7999999999999999,0.08333333333333333
|
169 |
+
data_analysis_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
|
170 |
+
data_analysis_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.7999999999999999,0.08333333333333333
|
171 |
+
data_analysis_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.7999999999999999,0.08333333333333333
|
172 |
+
language_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.6,0.23333333333333334
|
173 |
+
language_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.7999999999999999,0.08333333333333333
|
174 |
+
language_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.7999999999999999,0.08333333333333333
|
175 |
+
language_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.7999999999999999,0.08333333333333333
|
176 |
+
language_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.7999999999999999,0.08333333333333333
|
177 |
+
language_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.7999999999999999,0.08333333333333333
|
178 |
+
language_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.6,0.23333333333333334
|
179 |
+
language_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
|
180 |
+
language_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.6,0.23333333333333334
|
181 |
+
language_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.6,0.23333333333333334
|
182 |
+
if_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.0,1.0
|
183 |
+
if_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,1,-0.6,0.23333333333333334
|
184 |
+
if_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.19999999999999998,0.8166666666666667
|
185 |
+
if_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.19999999999999998,0.8166666666666667
|
186 |
+
if_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.19999999999999998,0.8166666666666667
|
187 |
+
if_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.19999999999999998,0.8166666666666667
|
188 |
+
if_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.19999999999999998,0.8166666666666667
|
189 |
+
if_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.19999999999999998,0.8166666666666667
|
190 |
+
if_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.0,1.0
|
191 |
+
if_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.19999999999999998,0.8166666666666667
|
192 |
+
arena_hard,arena_hard_2404,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.39999999999999997,0.48333333333333334
|
193 |
+
arena_hard,arena_hard_2404,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.9999999999999999,0.016666666666666666
|
194 |
+
arena_hard,arena_hard_2404,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.19999999999999998,0.8166666666666667
|
195 |
+
arena_hard,arena_hard_2404,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.7999999999999999,0.08333333333333333
|
196 |
+
arena_hard,arena_hard_2404,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.9999999999999999,0.016666666666666666
|
197 |
+
arena_hard,arena_hard_2404,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.7999999999999999,0.08333333333333333
|
198 |
+
arena_hard,arena_hard_2404,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
|
199 |
+
arena_hard,arena_hard_2404,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.9999999999999999,0.016666666666666666
|
200 |
+
arena_hard,arena_hard_2404,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.7999999999999999,0.08333333333333333
|
201 |
+
arena_hard,arena_hard_2404,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.7999999999999999,0.08333333333333333
|
202 |
+
mixeval,mixeval_240601,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.7999999999999999,0.08333333333333333
|
203 |
+
mixeval,mixeval_240601,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.7999999999999999,0.08333333333333333
|
204 |
+
mixeval,mixeval_240601,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.7999999999999999,0.08333333333333333
|
205 |
+
mixeval,mixeval_240601,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.7999999999999999,0.08333333333333333
|
206 |
+
mixeval,mixeval_240601,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.7999999999999999,0.08333333333333333
|
207 |
+
mixeval,mixeval_240601,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.7999999999999999,0.08333333333333333
|
208 |
+
mixeval,mixeval_240601,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.7999999999999999,0.08333333333333333
|
209 |
+
mixeval,mixeval_240601,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.7999999999999999,0.08333333333333333
|
210 |
+
mixeval,mixeval_240601,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.7999999999999999,0.08333333333333333
|
211 |
+
mixeval,mixeval_240601,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.7999999999999999,0.08333333333333333
|
212 |
+
agieval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.31622776601683794,0.44848886103153174
|
213 |
+
agieval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.31622776601683794,0.44848886103153174
|
214 |
+
agieval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.39999999999999997,0.48333333333333334
|
215 |
+
agieval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.7999999999999999,0.08333333333333333
|
216 |
+
agieval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.7999999999999999,0.08333333333333333
|
217 |
+
agieval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.9999999999999999,0.016666666666666666
|
218 |
+
agieval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.19999999999999998,0.8166666666666667
|
219 |
+
agieval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.19999999999999998,0.8166666666666667
|
220 |
+
agieval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.39999999999999997,0.48333333333333334
|
221 |
+
agieval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.7999999999999999,0.08333333333333333
|
222 |
+
arc_c,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.6,0.23333333333333334
|
223 |
+
arc_c,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
|
224 |
+
arc_c,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,-0.19999999999999998,0.8166666666666667
|
225 |
+
arc_c,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,-0.39999999999999997,0.48333333333333334
|
226 |
+
arc_c,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.0,1.0
|
227 |
+
arc_c,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.6,0.23333333333333334
|
228 |
+
arc_c,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.31622776601683794,0.44848886103153174
|
229 |
+
arc_c,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
|
230 |
+
arc_c,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.39999999999999997,0.48333333333333334
|
231 |
+
arc_c,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.7378647873726218,0.07697417298126676
|
232 |
+
alpacav1,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,-0.31622776601683794,0.44848886103153174
|
233 |
+
alpacav1,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,-0.7999999999999999,0.08333333333333333
|
234 |
+
alpacav1,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.7999999999999999,0.08333333333333333
|
235 |
+
alpacav1,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.7999999999999999,0.08333333333333333
|
236 |
+
alpacav1,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.39999999999999997,0.48333333333333334
|
237 |
+
alpacav1,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.39999999999999997,0.48333333333333334
|
238 |
+
alpacav1,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.6,0.23333333333333334
|
239 |
+
alpacav1,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.7378647873726218,0.07697417298126676
|
240 |
+
alpacav1,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.6,0.23333333333333334
|
241 |
+
alpacav1,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.31622776601683794,0.44848886103153174
|
242 |
+
alpacav2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.39999999999999997,0.48333333333333334
|
243 |
+
alpacav2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.19999999999999998,0.8166666666666667
|
244 |
+
alpacav2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.39999999999999997,0.48333333333333334
|
245 |
+
alpacav2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.7378647873726218,0.07697417298126676
|
246 |
+
alpacav2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.39999999999999997,0.48333333333333334
|
247 |
+
alpacav2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,-0.6,0.23333333333333334
|
248 |
+
alpacav2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,-0.39999999999999997,0.48333333333333334
|
249 |
+
alpacav2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
|
250 |
+
alpacav2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.0,1.0
|
251 |
+
alpacav2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.6,0.23333333333333334
|
252 |
+
alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.0,1.0
|
253 |
+
alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,-0.6,0.23333333333333334
|
254 |
+
alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.7999999999999999,0.08333333333333333
|
255 |
+
alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.0,1.0
|
256 |
+
alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.6,0.23333333333333334
|
257 |
+
alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.39999999999999997,0.48333333333333334
|
258 |
+
alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
|
259 |
+
alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
|
260 |
+
alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,-0.39999999999999997,0.48333333333333334
|
261 |
+
alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.0,1.0
|
262 |
+
arena_elo,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.31622776601683794,0.44848886103153174
|
263 |
+
arena_elo,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.6,0.23333333333333334
|
264 |
+
arena_elo,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.31622776601683794,0.44848886103153174
|
265 |
+
arena_elo,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,-0.9999999999999999,0.016666666666666666
|
266 |
+
arena_elo,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.39999999999999997,0.48333333333333334
|
267 |
+
arena_elo,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.31622776601683794,0.44848886103153174
|
268 |
+
arena_elo,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,-0.9999999999999999,0.016666666666666666
|
269 |
+
arena_elo,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
|
270 |
+
arena_elo,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,-0.10540925533894598,0.8005421074231263
|
271 |
+
arena_elo,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.6,0.23333333333333334
|
272 |
+
bbh,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.9486832980505137,0.02297740150320607
|
273 |
+
bbh,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.31622776601683794,0.44848886103153174
|
274 |
+
bbh,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,-0.39999999999999997,0.48333333333333334
|
275 |
+
bbh,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.9999999999999999,0.016666666666666666
|
276 |
+
bbh,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.0,1.0
|
277 |
+
bbh,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,-0.39999999999999997,0.48333333333333334
|
278 |
+
bbh,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.7999999999999999,0.08333333333333333
|
279 |
+
bbh,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.7999999999999999,0.08333333333333333
|
280 |
+
bbh,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.5270462766947298,0.206507295485425
|
281 |
+
bbh,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.5270462766947298,0.206507295485425
|
282 |
+
eq_benchv2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,-0.19999999999999998,0.8166666666666667
|
283 |
+
eq_benchv2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.19999999999999998,0.8166666666666667
|
284 |
+
eq_benchv2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.0,1.0
|
285 |
+
eq_benchv2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.0,1.0
|
286 |
+
eq_benchv2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,-0.39999999999999997,0.48333333333333334
|
287 |
+
eq_benchv2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.39999999999999997,0.48333333333333334
|
288 |
+
eq_benchv2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
|
289 |
+
eq_benchv2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.7999999999999999,0.08333333333333333
|
290 |
+
eq_benchv2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.19999999999999998,0.8166666666666667
|
291 |
+
eq_benchv2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.7999999999999999,0.08333333333333333
|
292 |
+
gpt4all,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.7378647873726218,0.07697417298126676
|
293 |
+
gpt4all,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
|
294 |
+
gpt4all,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.31622776601683794,0.44848886103153174
|
295 |
+
gpt4all,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.39999999999999997,0.48333333333333334
|
296 |
+
gpt4all,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.31622776601683794,0.44848886103153174
|
297 |
+
gpt4all,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.39999999999999997,0.48333333333333334
|
298 |
+
gpt4all,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,-0.39999999999999997,0.48333333333333334
|
299 |
+
gpt4all,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,-0.39999999999999997,0.48333333333333334
|
300 |
+
gpt4all,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.39999999999999997,0.48333333333333334
|
301 |
+
gpt4all,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.5270462766947298,0.206507295485425
|
302 |
+
hugging_6,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,-0.6,0.23333333333333334
|
303 |
+
hugging_6,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,-0.6,0.23333333333333334
|
304 |
+
hugging_6,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.19999999999999998,0.8166666666666667
|
305 |
+
hugging_6,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.39999999999999997,0.48333333333333334
|
306 |
+
hugging_6,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.19999999999999998,0.8166666666666667
|
307 |
+
hugging_6,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.6,0.23333333333333334
|
308 |
+
hugging_6,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,-0.39999999999999997,0.48333333333333334
|
309 |
+
hugging_6,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,-0.6,0.23333333333333334
|
310 |
+
hugging_6,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,-0.39999999999999997,0.48333333333333334
|
311 |
+
hugging_6,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.6,0.23333333333333334
|
312 |
+
llmonitor,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.11952286093343936,0.7815112949987133
|
313 |
+
llmonitor,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.0,1.0
|
314 |
+
llmonitor,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.0,1.0
|
315 |
+
llmonitor,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.6,0.23333333333333334
|
316 |
+
llmonitor,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.39999999999999997,0.48333333333333334
|
317 |
+
llmonitor,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.6,0.23333333333333334
|
318 |
+
llmonitor,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.35856858280031806,0.40538055645894233
|
319 |
+
llmonitor,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.31622776601683794,0.44848886103153174
|
320 |
+
llmonitor,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.0,1.0
|
321 |
+
llmonitor,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.5270462766947298,0.206507295485425
|
322 |
+
magi,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.39999999999999997,0.48333333333333334
|
323 |
+
magi,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.19999999999999998,0.8166666666666667
|
324 |
+
magi,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,-0.10540925533894598,0.8005421074231263
|
325 |
+
magi,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.39999999999999997,0.48333333333333334
|
326 |
+
magi,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.6,0.23333333333333334
|
327 |
+
magi,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.7999999999999999,0.08333333333333333
|
328 |
+
magi,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.6,0.23333333333333334
|
329 |
+
magi,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.7999999999999999,0.08333333333333333
|
330 |
+
magi,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.19999999999999998,0.8166666666666667
|
331 |
+
magi,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,-0.19999999999999998,0.8166666666666667
|
332 |
+
mmlu,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.6,0.23333333333333334
|
333 |
+
mmlu,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
|
334 |
+
mmlu,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.0,1.0
|
335 |
+
mmlu,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.39999999999999997,0.48333333333333334
|
336 |
+
mmlu,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.6,0.23333333333333334
|
337 |
+
mmlu,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.6,0.23333333333333334
|
338 |
+
mmlu,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.6,0.23333333333333334
|
339 |
+
mmlu,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
|
340 |
+
mmlu,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.6,0.23333333333333334
|
341 |
+
mmlu,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.5270462766947298,0.206507295485425
|
342 |
+
mt_bench,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.6,0.23333333333333334
|
343 |
+
mt_bench,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.0,1.0
|
344 |
+
mt_bench,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.10540925533894598,0.8005421074231263
|
345 |
+
mt_bench,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.39999999999999997,0.48333333333333334
|
346 |
+
mt_bench,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.39999999999999997,0.48333333333333334
|
347 |
+
mt_bench,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.19999999999999998,0.8166666666666667
|
348 |
+
mt_bench,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.10540925533894598,0.8005421074231263
|
349 |
+
mt_bench,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,-0.39999999999999997,0.48333333333333334
|
350 |
+
mt_bench,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.19999999999999998,0.8166666666666667
|
351 |
+
mt_bench,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.39999999999999997,0.48333333333333334
|
352 |
+
biggen_mwr,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.39999999999999997,0.48333333333333334
|
353 |
+
biggen_mwr,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
|
354 |
+
biggen_mwr,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.19999999999999998,0.8166666666666667
|
355 |
+
biggen_mwr,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.6,0.23333333333333334
|
356 |
+
biggen_mwr,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.6,0.23333333333333334
|
357 |
+
biggen_mwr,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.39999999999999997,0.48333333333333334
|
358 |
+
biggen_mwr,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
|
359 |
+
biggen_mwr,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.39999999999999997,0.48333333333333334
|
360 |
+
biggen_mwr,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.6,0.23333333333333334
|
361 |
+
biggen_mwr,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.7999999999999999,0.08333333333333333
|
362 |
+
aggregate,holistic,hellaswag,BLZ_240312,kendall,somewhere_aggregate,5,0,0.0,1.0
|
363 |
+
aggregate,holistic,hellaswag,BLZ_240312,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
|
364 |
+
aggregate,holistic,hellaswag,BLZ_240312,kendall,somewhere_aggregate,5,2,-0.19999999999999998,0.8166666666666667
|
365 |
+
aggregate,holistic,hellaswag,BLZ_240312,kendall,somewhere_aggregate,5,3,0.6,0.23333333333333334
|
366 |
+
aggregate,holistic,hellaswag,BLZ_240312,kendall,somewhere_aggregate,5,4,0.39999999999999997,0.48333333333333334
|
367 |
+
aggregate,holistic,hellaswag,BLZ_240312,kendall,somewhere_aggregate,5,5,0.19999999999999998,0.8166666666666667
|
368 |
+
aggregate,holistic,hellaswag,BLZ_240312,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
|
369 |
+
aggregate,holistic,hellaswag,BLZ_240312,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
|
370 |
+
aggregate,holistic,hellaswag,BLZ_240312,kendall,somewhere_aggregate,5,8,0.39999999999999997,0.48333333333333334
|
371 |
+
aggregate,holistic,hellaswag,BLZ_240312,kendall,somewhere_aggregate,5,9,0.6,0.23333333333333334
|
372 |
+
aggregate,holistic,humaneval,BLZ_240312,kendall,somewhere_aggregate,5,0,-0.39999999999999997,0.48333333333333334
|
373 |
+
aggregate,holistic,humaneval,BLZ_240312,kendall,somewhere_aggregate,5,1,-0.5270462766947298,0.206507295485425
|
374 |
+
aggregate,holistic,humaneval,BLZ_240312,kendall,somewhere_aggregate,5,2,0.5270462766947298,0.206507295485425
|
375 |
+
aggregate,holistic,humaneval,BLZ_240312,kendall,somewhere_aggregate,5,3,0.5270462766947298,0.206507295485425
|
376 |
+
aggregate,holistic,humaneval,BLZ_240312,kendall,somewhere_aggregate,5,4,-0.22360679774997896,0.6015081344405899
|
377 |
+
aggregate,holistic,humaneval,BLZ_240312,kendall,somewhere_aggregate,5,5,0.10540925533894598,0.8005421074231263
|
378 |
+
aggregate,holistic,humaneval,BLZ_240312,kendall,somewhere_aggregate,5,6,-0.10540925533894598,0.8005421074231263
|
379 |
+
aggregate,holistic,humaneval,BLZ_240312,kendall,somewhere_aggregate,5,7,0.39999999999999997,0.48333333333333334
|
380 |
+
aggregate,holistic,humaneval,BLZ_240312,kendall,somewhere_aggregate,5,8,-0.10540925533894598,0.8005421074231263
|
381 |
+
aggregate,holistic,humaneval,BLZ_240312,kendall,somewhere_aggregate,5,9,-0.22360679774997896,0.6015081344405899
|
382 |
+
aggregate,holistic,mbpp,BLZ_240312,kendall,somewhere_aggregate,5,0,0.10540925533894598,0.8005421074231263
|
383 |
+
aggregate,holistic,mbpp,BLZ_240312,kendall,somewhere_aggregate,5,1,-0.31622776601683794,0.44848886103153174
|
384 |
+
aggregate,holistic,mbpp,BLZ_240312,kendall,somewhere_aggregate,5,2,-0.31622776601683794,0.44848886103153174
|
385 |
+
aggregate,holistic,mbpp,BLZ_240312,kendall,somewhere_aggregate,5,3,0.10540925533894598,0.8005421074231263
|
386 |
+
aggregate,holistic,mbpp,BLZ_240312,kendall,somewhere_aggregate,5,4,-0.39999999999999997,0.48333333333333334
|
387 |
+
aggregate,holistic,mbpp,BLZ_240312,kendall,somewhere_aggregate,5,5,0.10540925533894598,0.8005421074231263
|
388 |
+
aggregate,holistic,mbpp,BLZ_240312,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
|
389 |
+
aggregate,holistic,mbpp,BLZ_240312,kendall,somewhere_aggregate,5,7,0.39999999999999997,0.48333333333333334
|
390 |
+
aggregate,holistic,mbpp,BLZ_240312,kendall,somewhere_aggregate,5,8,0.10540925533894598,0.8005421074231263
|
391 |
+
aggregate,holistic,mbpp,BLZ_240312,kendall,somewhere_aggregate,5,9,0.39999999999999997,0.48333333333333334
|
392 |
+
aggregate,holistic,winogrande,BLZ_240312,kendall,somewhere_aggregate,5,0,-0.19999999999999998,0.8166666666666667
|
393 |
+
aggregate,holistic,winogrande,BLZ_240312,kendall,somewhere_aggregate,5,1,0.7999999999999999,0.08333333333333333
|
394 |
+
aggregate,holistic,winogrande,BLZ_240312,kendall,somewhere_aggregate,5,2,0.0,1.0
|
395 |
+
aggregate,holistic,winogrande,BLZ_240312,kendall,somewhere_aggregate,5,3,-0.19999999999999998,0.8166666666666667
|
396 |
+
aggregate,holistic,winogrande,BLZ_240312,kendall,somewhere_aggregate,5,4,0.31622776601683794,0.44848886103153174
|
397 |
+
aggregate,holistic,winogrande,BLZ_240312,kendall,somewhere_aggregate,5,5,0.7999999999999999,0.08333333333333333
|
398 |
+
aggregate,holistic,winogrande,BLZ_240312,kendall,somewhere_aggregate,5,6,-0.9999999999999999,0.016666666666666666
|
399 |
+
aggregate,holistic,winogrande,BLZ_240312,kendall,somewhere_aggregate,5,7,0.7999999999999999,0.08333333333333333
|
400 |
+
aggregate,holistic,winogrande,BLZ_240312,kendall,somewhere_aggregate,5,8,0.0,1.0
|
401 |
+
aggregate,holistic,winogrande,BLZ_240312,kendall,somewhere_aggregate,5,9,-0.19999999999999998,0.8166666666666667
|
402 |
+
aggregate,holistic,grounding,biggen_240612,kendall,somewhere_aggregate,5,0,0.9486832980505137,0.02297740150320607
|
403 |
+
aggregate,holistic,grounding,biggen_240612,kendall,somewhere_aggregate,5,1,0.19999999999999998,0.8166666666666667
|
404 |
+
aggregate,holistic,grounding,biggen_240612,kendall,somewhere_aggregate,5,2,0.10540925533894598,0.8005421074231263
|
405 |
+
aggregate,holistic,grounding,biggen_240612,kendall,somewhere_aggregate,5,3,0.31622776601683794,0.44848886103153174
|
406 |
+
aggregate,holistic,grounding,biggen_240612,kendall,somewhere_aggregate,5,4,0.9486832980505137,0.02297740150320607
|
407 |
+
aggregate,holistic,grounding,biggen_240612,kendall,somewhere_aggregate,5,5,0.9486832980505137,0.02297740150320607
|
408 |
+
aggregate,holistic,grounding,biggen_240612,kendall,somewhere_aggregate,5,6,0.31622776601683794,0.44848886103153174
|
409 |
+
aggregate,holistic,grounding,biggen_240612,kendall,somewhere_aggregate,5,7,0.19999999999999998,0.8166666666666667
|
410 |
+
aggregate,holistic,grounding,biggen_240612,kendall,somewhere_aggregate,5,8,0.39999999999999997,0.48333333333333334
|
411 |
+
aggregate,holistic,grounding,biggen_240612,kendall,somewhere_aggregate,5,9,0.19999999999999998,0.8166666666666667
|
412 |
+
aggregate,holistic,instruction_following,biggen_240612,kendall,somewhere_aggregate,5,0,0.0,1.0
|
413 |
+
aggregate,holistic,instruction_following,biggen_240612,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
|
414 |
+
aggregate,holistic,instruction_following,biggen_240612,kendall,somewhere_aggregate,5,2,0.7378647873726218,0.07697417298126676
|
415 |
+
aggregate,holistic,instruction_following,biggen_240612,kendall,somewhere_aggregate,5,3,-0.39999999999999997,0.48333333333333334
|
416 |
+
aggregate,holistic,instruction_following,biggen_240612,kendall,somewhere_aggregate,5,4,0.8944271909999159,0.0367138563627041
|
417 |
+
aggregate,holistic,instruction_following,biggen_240612,kendall,somewhere_aggregate,5,5,0.9486832980505137,0.02297740150320607
|
418 |
+
aggregate,holistic,instruction_following,biggen_240612,kendall,somewhere_aggregate,5,6,0.9486832980505137,0.02297740150320607
|
419 |
+
aggregate,holistic,instruction_following,biggen_240612,kendall,somewhere_aggregate,5,7,0.0,1.0
|
420 |
+
aggregate,holistic,instruction_following,biggen_240612,kendall,somewhere_aggregate,5,8,0.8944271909999159,0.0367138563627041
|
421 |
+
aggregate,holistic,instruction_following,biggen_240612,kendall,somewhere_aggregate,5,9,0.7378647873726218,0.07697417298126676
|
422 |
+
aggregate,holistic,planning,biggen_240612,kendall,somewhere_aggregate,5,0,0.7378647873726218,0.07697417298126676
|
423 |
+
aggregate,holistic,planning,biggen_240612,kendall,somewhere_aggregate,5,1,0.7999999999999999,0.08333333333333333
|
424 |
+
aggregate,holistic,planning,biggen_240612,kendall,somewhere_aggregate,5,2,0.39999999999999997,0.48333333333333334
|
425 |
+
aggregate,holistic,planning,biggen_240612,kendall,somewhere_aggregate,5,3,0.39999999999999997,0.48333333333333334
|
426 |
+
aggregate,holistic,planning,biggen_240612,kendall,somewhere_aggregate,5,4,0.19999999999999998,0.8166666666666667
|
427 |
+
aggregate,holistic,planning,biggen_240612,kendall,somewhere_aggregate,5,5,0.39999999999999997,0.48333333333333334
|
428 |
+
aggregate,holistic,planning,biggen_240612,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
|
429 |
+
aggregate,holistic,planning,biggen_240612,kendall,somewhere_aggregate,5,7,0.7378647873726218,0.07697417298126676
|
430 |
+
aggregate,holistic,planning,biggen_240612,kendall,somewhere_aggregate,5,8,-0.10540925533894598,0.8005421074231263
|
431 |
+
aggregate,holistic,planning,biggen_240612,kendall,somewhere_aggregate,5,9,0.7378647873726218,0.07697417298126676
|
432 |
+
aggregate,holistic,reasoning,biggen_240612,kendall,somewhere_aggregate,5,0,0.7378647873726218,0.07697417298126676
|
433 |
+
aggregate,holistic,reasoning,biggen_240612,kendall,somewhere_aggregate,5,1,0.6,0.23333333333333334
|
434 |
+
aggregate,holistic,reasoning,biggen_240612,kendall,somewhere_aggregate,5,2,0.7378647873726218,0.07697417298126676
|
435 |
+
aggregate,holistic,reasoning,biggen_240612,kendall,somewhere_aggregate,5,3,0.6,0.23333333333333334
|
436 |
+
aggregate,holistic,reasoning,biggen_240612,kendall,somewhere_aggregate,5,4,0.5270462766947298,0.206507295485425
|
437 |
+
aggregate,holistic,reasoning,biggen_240612,kendall,somewhere_aggregate,5,5,0.7999999999999999,0.08333333333333333
|
438 |
+
aggregate,holistic,reasoning,biggen_240612,kendall,somewhere_aggregate,5,6,0.5270462766947298,0.206507295485425
|
439 |
+
aggregate,holistic,reasoning,biggen_240612,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
|
440 |
+
aggregate,holistic,reasoning,biggen_240612,kendall,somewhere_aggregate,5,8,0.0,1.0
|
441 |
+
aggregate,holistic,reasoning,biggen_240612,kendall,somewhere_aggregate,5,9,0.39999999999999997,0.48333333333333334
|
442 |
+
aggregate,holistic,refinement,biggen_240612,kendall,somewhere_aggregate,5,0,0.6,0.23333333333333334
|
443 |
+
aggregate,holistic,refinement,biggen_240612,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
|
444 |
+
aggregate,holistic,refinement,biggen_240612,kendall,somewhere_aggregate,5,2,0.19999999999999998,0.8166666666666667
|
445 |
+
aggregate,holistic,refinement,biggen_240612,kendall,somewhere_aggregate,5,3,0.39999999999999997,0.48333333333333334
|
446 |
+
aggregate,holistic,refinement,biggen_240612,kendall,somewhere_aggregate,5,4,0.39999999999999997,0.48333333333333334
|
447 |
+
aggregate,holistic,refinement,biggen_240612,kendall,somewhere_aggregate,5,5,0.39999999999999997,0.48333333333333334
|
448 |
+
aggregate,holistic,refinement,biggen_240612,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
|
449 |
+
aggregate,holistic,refinement,biggen_240612,kendall,somewhere_aggregate,5,7,0.39999999999999997,0.48333333333333334
|
450 |
+
aggregate,holistic,refinement,biggen_240612,kendall,somewhere_aggregate,5,8,0.39999999999999997,0.48333333333333334
|
451 |
+
aggregate,holistic,refinement,biggen_240612,kendall,somewhere_aggregate,5,9,0.6,0.23333333333333334
|
452 |
+
aggregate,holistic,safety,biggen_240612,kendall,somewhere_aggregate,5,0,0.0,1.0
|
453 |
+
aggregate,holistic,safety,biggen_240612,kendall,somewhere_aggregate,5,1,0.0,1.0
|
454 |
+
aggregate,holistic,safety,biggen_240612,kendall,somewhere_aggregate,5,2,-0.19999999999999998,0.8166666666666667
|
455 |
+
aggregate,holistic,safety,biggen_240612,kendall,somewhere_aggregate,5,3,0.0,1.0
|
456 |
+
aggregate,holistic,safety,biggen_240612,kendall,somewhere_aggregate,5,4,-0.19999999999999998,0.8166666666666667
|
457 |
+
aggregate,holistic,safety,biggen_240612,kendall,somewhere_aggregate,5,5,-0.6,0.23333333333333334
|
458 |
+
aggregate,holistic,safety,biggen_240612,kendall,somewhere_aggregate,5,6,0.7999999999999999,0.08333333333333333
|
459 |
+
aggregate,holistic,safety,biggen_240612,kendall,somewhere_aggregate,5,7,-0.6,0.23333333333333334
|
460 |
+
aggregate,holistic,safety,biggen_240612,kendall,somewhere_aggregate,5,8,0.6,0.23333333333333334
|
461 |
+
aggregate,holistic,safety,biggen_240612,kendall,somewhere_aggregate,5,9,0.6,0.23333333333333334
|
462 |
+
aggregate,holistic,theory_of_mind,biggen_240612,kendall,somewhere_aggregate,5,0,0.19999999999999998,0.8166666666666667
|
463 |
+
aggregate,holistic,theory_of_mind,biggen_240612,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
|
464 |
+
aggregate,holistic,theory_of_mind,biggen_240612,kendall,somewhere_aggregate,5,2,0.6,0.23333333333333334
|
465 |
+
aggregate,holistic,theory_of_mind,biggen_240612,kendall,somewhere_aggregate,5,3,0.0,1.0
|
466 |
+
aggregate,holistic,theory_of_mind,biggen_240612,kendall,somewhere_aggregate,5,4,0.19999999999999998,0.8166666666666667
|
467 |
+
aggregate,holistic,theory_of_mind,biggen_240612,kendall,somewhere_aggregate,5,5,0.19999999999999998,0.8166666666666667
|
468 |
+
aggregate,holistic,theory_of_mind,biggen_240612,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
|
469 |
+
aggregate,holistic,theory_of_mind,biggen_240612,kendall,somewhere_aggregate,5,7,0.0,1.0
|
470 |
+
aggregate,holistic,theory_of_mind,biggen_240612,kendall,somewhere_aggregate,5,8,0.19999999999999998,0.8166666666666667
|
471 |
+
aggregate,holistic,theory_of_mind,biggen_240612,kendall,somewhere_aggregate,5,9,0.39999999999999997,0.48333333333333334
|
472 |
+
aggregate,holistic,tool_usage,biggen_240612,kendall,somewhere_aggregate,5,0,0.19999999999999998,0.8166666666666667
|
473 |
+
aggregate,holistic,tool_usage,biggen_240612,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
|
474 |
+
aggregate,holistic,tool_usage,biggen_240612,kendall,somewhere_aggregate,5,2,0.9999999999999999,0.016666666666666666
|
475 |
+
aggregate,holistic,tool_usage,biggen_240612,kendall,somewhere_aggregate,5,3,0.9999999999999999,0.016666666666666666
|
476 |
+
aggregate,holistic,tool_usage,biggen_240612,kendall,somewhere_aggregate,5,4,0.39999999999999997,0.48333333333333334
|
477 |
+
aggregate,holistic,tool_usage,biggen_240612,kendall,somewhere_aggregate,5,5,0.9999999999999999,0.016666666666666666
|
478 |
+
aggregate,holistic,tool_usage,biggen_240612,kendall,somewhere_aggregate,5,6,0.19999999999999998,0.8166666666666667
|
479 |
+
aggregate,holistic,tool_usage,biggen_240612,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
|
480 |
+
aggregate,holistic,tool_usage,biggen_240612,kendall,somewhere_aggregate,5,8,0.7999999999999999,0.08333333333333333
|
481 |
+
aggregate,holistic,tool_usage,biggen_240612,kendall,somewhere_aggregate,5,9,0.39999999999999997,0.48333333333333334
|
482 |
+
aggregate,holistic,livebench_average,livebench_240701,kendall,somewhere_aggregate,5,0,0.6,0.23333333333333334
|
483 |
+
aggregate,holistic,livebench_average,livebench_240701,kendall,somewhere_aggregate,5,1,0.7999999999999999,0.08333333333333333
|
484 |
+
aggregate,holistic,livebench_average,livebench_240701,kendall,somewhere_aggregate,5,2,0.7999999999999999,0.08333333333333333
|
485 |
+
aggregate,holistic,livebench_average,livebench_240701,kendall,somewhere_aggregate,5,3,0.6,0.23333333333333334
|
486 |
+
aggregate,holistic,livebench_average,livebench_240701,kendall,somewhere_aggregate,5,4,0.6,0.23333333333333334
|
487 |
+
aggregate,holistic,livebench_average,livebench_240701,kendall,somewhere_aggregate,5,5,0.39999999999999997,0.48333333333333334
|
488 |
+
aggregate,holistic,livebench_average,livebench_240701,kendall,somewhere_aggregate,5,6,0.6,0.23333333333333334
|
489 |
+
aggregate,holistic,livebench_average,livebench_240701,kendall,somewhere_aggregate,5,7,0.7999999999999999,0.08333333333333333
|
490 |
+
aggregate,holistic,livebench_average,livebench_240701,kendall,somewhere_aggregate,5,8,0.39999999999999997,0.48333333333333334
|
491 |
+
aggregate,holistic,livebench_average,livebench_240701,kendall,somewhere_aggregate,5,9,0.39999999999999997,0.48333333333333334
|
492 |
+
aggregate,holistic,reasoning_average,livebench_240701,kendall,somewhere_aggregate,5,0,0.7378647873726218,0.07697417298126676
|
493 |
+
aggregate,holistic,reasoning_average,livebench_240701,kendall,somewhere_aggregate,5,1,0.5270462766947298,0.206507295485425
|
494 |
+
aggregate,holistic,reasoning_average,livebench_240701,kendall,somewhere_aggregate,5,2,0.7378647873726218,0.07697417298126676
|
495 |
+
aggregate,holistic,reasoning_average,livebench_240701,kendall,somewhere_aggregate,5,3,0.7378647873726218,0.07697417298126676
|
496 |
+
aggregate,holistic,reasoning_average,livebench_240701,kendall,somewhere_aggregate,5,4,0.5270462766947298,0.206507295485425
|
497 |
+
aggregate,holistic,reasoning_average,livebench_240701,kendall,somewhere_aggregate,5,5,0.7999999999999999,0.08333333333333333
|
498 |
+
aggregate,holistic,reasoning_average,livebench_240701,kendall,somewhere_aggregate,5,6,0.7378647873726218,0.07697417298126676
|
499 |
+
aggregate,holistic,reasoning_average,livebench_240701,kendall,somewhere_aggregate,5,7,0.7999999999999999,0.08333333333333333
|
500 |
+
aggregate,holistic,reasoning_average,livebench_240701,kendall,somewhere_aggregate,5,8,0.5270462766947298,0.206507295485425
|
501 |
+
aggregate,holistic,reasoning_average,livebench_240701,kendall,somewhere_aggregate,5,9,0.7378647873726218,0.07697417298126676
|
502 |
+
aggregate,holistic,coding_average,livebench_240701,kendall,somewhere_aggregate,5,0,0.31622776601683794,0.44848886103153174
|
503 |
+
aggregate,holistic,coding_average,livebench_240701,kendall,somewhere_aggregate,5,1,0.31622776601683794,0.44848886103153174
|
504 |
+
aggregate,holistic,coding_average,livebench_240701,kendall,somewhere_aggregate,5,2,0.6,0.23333333333333334
|
505 |
+
aggregate,holistic,coding_average,livebench_240701,kendall,somewhere_aggregate,5,3,0.31622776601683794,0.44848886103153174
|
506 |
+
aggregate,holistic,coding_average,livebench_240701,kendall,somewhere_aggregate,5,4,0.31622776601683794,0.44848886103153174
|
507 |
+
aggregate,holistic,coding_average,livebench_240701,kendall,somewhere_aggregate,5,5,0.5270462766947298,0.206507295485425
|
508 |
+
aggregate,holistic,coding_average,livebench_240701,kendall,somewhere_aggregate,5,6,0.5270462766947298,0.206507295485425
|
509 |
+
aggregate,holistic,coding_average,livebench_240701,kendall,somewhere_aggregate,5,7,0.5270462766947298,0.206507295485425
|
510 |
+
aggregate,holistic,coding_average,livebench_240701,kendall,somewhere_aggregate,5,8,0.5270462766947298,0.206507295485425
|
511 |
+
aggregate,holistic,coding_average,livebench_240701,kendall,somewhere_aggregate,5,9,0.5270462766947298,0.206507295485425
|
512 |
+
aggregate,holistic,mathematics_average,livebench_240701,kendall,somewhere_aggregate,5,0,0.6,0.23333333333333334
|
513 |
+
aggregate,holistic,mathematics_average,livebench_240701,kendall,somewhere_aggregate,5,1,0.6,0.23333333333333334
|
514 |
+
aggregate,holistic,mathematics_average,livebench_240701,kendall,somewhere_aggregate,5,2,0.6,0.23333333333333334
|
515 |
+
aggregate,holistic,mathematics_average,livebench_240701,kendall,somewhere_aggregate,5,3,0.6,0.23333333333333334
|
516 |
+
aggregate,holistic,mathematics_average,livebench_240701,kendall,somewhere_aggregate,5,4,0.6,0.23333333333333334
|
517 |
+
aggregate,holistic,mathematics_average,livebench_240701,kendall,somewhere_aggregate,5,5,0.6,0.23333333333333334
|
518 |
+
aggregate,holistic,mathematics_average,livebench_240701,kendall,somewhere_aggregate,5,6,0.6,0.23333333333333334
|
519 |
+
aggregate,holistic,mathematics_average,livebench_240701,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
|
520 |
+
aggregate,holistic,mathematics_average,livebench_240701,kendall,somewhere_aggregate,5,8,0.6,0.23333333333333334
|
521 |
+
aggregate,holistic,mathematics_average,livebench_240701,kendall,somewhere_aggregate,5,9,0.6,0.23333333333333334
|
522 |
+
aggregate,holistic,data_analysis_average,livebench_240701,kendall,somewhere_aggregate,5,0,0.6,0.23333333333333334
|
523 |
+
aggregate,holistic,data_analysis_average,livebench_240701,kendall,somewhere_aggregate,5,1,0.6,0.23333333333333334
|
524 |
+
aggregate,holistic,data_analysis_average,livebench_240701,kendall,somewhere_aggregate,5,2,0.6,0.23333333333333334
|
525 |
+
aggregate,holistic,data_analysis_average,livebench_240701,kendall,somewhere_aggregate,5,3,0.6,0.23333333333333334
|
526 |
+
aggregate,holistic,data_analysis_average,livebench_240701,kendall,somewhere_aggregate,5,4,0.7999999999999999,0.08333333333333333
|
527 |
+
aggregate,holistic,data_analysis_average,livebench_240701,kendall,somewhere_aggregate,5,5,0.7999999999999999,0.08333333333333333
|
528 |
+
aggregate,holistic,data_analysis_average,livebench_240701,kendall,somewhere_aggregate,5,6,0.7999999999999999,0.08333333333333333
|
529 |
+
aggregate,holistic,data_analysis_average,livebench_240701,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
|
530 |
+
aggregate,holistic,data_analysis_average,livebench_240701,kendall,somewhere_aggregate,5,8,0.7999999999999999,0.08333333333333333
|
531 |
+
aggregate,holistic,data_analysis_average,livebench_240701,kendall,somewhere_aggregate,5,9,0.7999999999999999,0.08333333333333333
|
532 |
+
aggregate,holistic,language_average,livebench_240701,kendall,somewhere_aggregate,5,0,0.6,0.23333333333333334
|
533 |
+
aggregate,holistic,language_average,livebench_240701,kendall,somewhere_aggregate,5,1,0.7999999999999999,0.08333333333333333
|
534 |
+
aggregate,holistic,language_average,livebench_240701,kendall,somewhere_aggregate,5,2,0.7999999999999999,0.08333333333333333
|
535 |
+
aggregate,holistic,language_average,livebench_240701,kendall,somewhere_aggregate,5,3,0.7999999999999999,0.08333333333333333
|
536 |
+
aggregate,holistic,language_average,livebench_240701,kendall,somewhere_aggregate,5,4,0.7999999999999999,0.08333333333333333
|
537 |
+
aggregate,holistic,language_average,livebench_240701,kendall,somewhere_aggregate,5,5,0.7999999999999999,0.08333333333333333
|
538 |
+
aggregate,holistic,language_average,livebench_240701,kendall,somewhere_aggregate,5,6,0.6,0.23333333333333334
|
539 |
+
aggregate,holistic,language_average,livebench_240701,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
|
540 |
+
aggregate,holistic,language_average,livebench_240701,kendall,somewhere_aggregate,5,8,0.6,0.23333333333333334
|
541 |
+
aggregate,holistic,language_average,livebench_240701,kendall,somewhere_aggregate,5,9,0.6,0.23333333333333334
|
542 |
+
aggregate,holistic,if_average,livebench_240701,kendall,somewhere_aggregate,5,0,0.0,1.0
|
543 |
+
aggregate,holistic,if_average,livebench_240701,kendall,somewhere_aggregate,5,1,-0.6,0.23333333333333334
|
544 |
+
aggregate,holistic,if_average,livebench_240701,kendall,somewhere_aggregate,5,2,0.19999999999999998,0.8166666666666667
|
545 |
+
aggregate,holistic,if_average,livebench_240701,kendall,somewhere_aggregate,5,3,0.19999999999999998,0.8166666666666667
|
546 |
+
aggregate,holistic,if_average,livebench_240701,kendall,somewhere_aggregate,5,4,0.19999999999999998,0.8166666666666667
|
547 |
+
aggregate,holistic,if_average,livebench_240701,kendall,somewhere_aggregate,5,5,0.19999999999999998,0.8166666666666667
|
548 |
+
aggregate,holistic,if_average,livebench_240701,kendall,somewhere_aggregate,5,6,0.19999999999999998,0.8166666666666667
|
549 |
+
aggregate,holistic,if_average,livebench_240701,kendall,somewhere_aggregate,5,7,0.19999999999999998,0.8166666666666667
|
550 |
+
aggregate,holistic,if_average,livebench_240701,kendall,somewhere_aggregate,5,8,0.0,1.0
|
551 |
+
aggregate,holistic,if_average,livebench_240701,kendall,somewhere_aggregate,5,9,0.19999999999999998,0.8166666666666667
|
552 |
+
aggregate,holistic,arena_hard,arena_hard_2404,kendall,somewhere_aggregate,5,0,0.39999999999999997,0.48333333333333334
|
553 |
+
aggregate,holistic,arena_hard,arena_hard_2404,kendall,somewhere_aggregate,5,1,0.9999999999999999,0.016666666666666666
|
554 |
+
aggregate,holistic,arena_hard,arena_hard_2404,kendall,somewhere_aggregate,5,2,0.19999999999999998,0.8166666666666667
|
555 |
+
aggregate,holistic,arena_hard,arena_hard_2404,kendall,somewhere_aggregate,5,3,0.7999999999999999,0.08333333333333333
|
556 |
+
aggregate,holistic,arena_hard,arena_hard_2404,kendall,somewhere_aggregate,5,4,0.9999999999999999,0.016666666666666666
|
557 |
+
aggregate,holistic,arena_hard,arena_hard_2404,kendall,somewhere_aggregate,5,5,0.7999999999999999,0.08333333333333333
|
558 |
+
aggregate,holistic,arena_hard,arena_hard_2404,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
|
559 |
+
aggregate,holistic,arena_hard,arena_hard_2404,kendall,somewhere_aggregate,5,7,0.9999999999999999,0.016666666666666666
|
560 |
+
aggregate,holistic,arena_hard,arena_hard_2404,kendall,somewhere_aggregate,5,8,0.7999999999999999,0.08333333333333333
|
561 |
+
aggregate,holistic,arena_hard,arena_hard_2404,kendall,somewhere_aggregate,5,9,0.7999999999999999,0.08333333333333333
|
562 |
+
aggregate,holistic,mixeval,mixeval_240601,kendall,somewhere_aggregate,5,0,0.7999999999999999,0.08333333333333333
|
563 |
+
aggregate,holistic,mixeval,mixeval_240601,kendall,somewhere_aggregate,5,1,0.7999999999999999,0.08333333333333333
|
564 |
+
aggregate,holistic,mixeval,mixeval_240601,kendall,somewhere_aggregate,5,2,0.7999999999999999,0.08333333333333333
|
565 |
+
aggregate,holistic,mixeval,mixeval_240601,kendall,somewhere_aggregate,5,3,0.7999999999999999,0.08333333333333333
|
566 |
+
aggregate,holistic,mixeval,mixeval_240601,kendall,somewhere_aggregate,5,4,0.7999999999999999,0.08333333333333333
|
567 |
+
aggregate,holistic,mixeval,mixeval_240601,kendall,somewhere_aggregate,5,5,0.7999999999999999,0.08333333333333333
|
568 |
+
aggregate,holistic,mixeval,mixeval_240601,kendall,somewhere_aggregate,5,6,0.7999999999999999,0.08333333333333333
|
569 |
+
aggregate,holistic,mixeval,mixeval_240601,kendall,somewhere_aggregate,5,7,0.7999999999999999,0.08333333333333333
|
570 |
+
aggregate,holistic,mixeval,mixeval_240601,kendall,somewhere_aggregate,5,8,0.7999999999999999,0.08333333333333333
|
571 |
+
aggregate,holistic,mixeval,mixeval_240601,kendall,somewhere_aggregate,5,9,0.7999999999999999,0.08333333333333333
|
572 |
+
aggregate,holistic,agieval,BLZ_240312,kendall,somewhere_aggregate,5,0,0.31622776601683794,0.44848886103153174
|
573 |
+
aggregate,holistic,agieval,BLZ_240312,kendall,somewhere_aggregate,5,1,0.31622776601683794,0.44848886103153174
|
574 |
+
aggregate,holistic,agieval,BLZ_240312,kendall,somewhere_aggregate,5,2,0.39999999999999997,0.48333333333333334
|
575 |
+
aggregate,holistic,agieval,BLZ_240312,kendall,somewhere_aggregate,5,3,0.7999999999999999,0.08333333333333333
|
576 |
+
aggregate,holistic,agieval,BLZ_240312,kendall,somewhere_aggregate,5,4,0.7999999999999999,0.08333333333333333
|
577 |
+
aggregate,holistic,agieval,BLZ_240312,kendall,somewhere_aggregate,5,5,0.9999999999999999,0.016666666666666666
|
578 |
+
aggregate,holistic,agieval,BLZ_240312,kendall,somewhere_aggregate,5,6,0.19999999999999998,0.8166666666666667
|
579 |
+
aggregate,holistic,agieval,BLZ_240312,kendall,somewhere_aggregate,5,7,0.19999999999999998,0.8166666666666667
|
580 |
+
aggregate,holistic,agieval,BLZ_240312,kendall,somewhere_aggregate,5,8,0.39999999999999997,0.48333333333333334
|
581 |
+
aggregate,holistic,agieval,BLZ_240312,kendall,somewhere_aggregate,5,9,0.7999999999999999,0.08333333333333333
|
582 |
+
aggregate,holistic,arc_c,BLZ_240312,kendall,somewhere_aggregate,5,0,0.6,0.23333333333333334
|
583 |
+
aggregate,holistic,arc_c,BLZ_240312,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
|
584 |
+
aggregate,holistic,arc_c,BLZ_240312,kendall,somewhere_aggregate,5,2,-0.19999999999999998,0.8166666666666667
|
585 |
+
aggregate,holistic,arc_c,BLZ_240312,kendall,somewhere_aggregate,5,3,-0.39999999999999997,0.48333333333333334
|
586 |
+
aggregate,holistic,arc_c,BLZ_240312,kendall,somewhere_aggregate,5,4,0.0,1.0
|
587 |
+
aggregate,holistic,arc_c,BLZ_240312,kendall,somewhere_aggregate,5,5,0.6,0.23333333333333334
|
588 |
+
aggregate,holistic,arc_c,BLZ_240312,kendall,somewhere_aggregate,5,6,0.31622776601683794,0.44848886103153174
|
589 |
+
aggregate,holistic,arc_c,BLZ_240312,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
|
590 |
+
aggregate,holistic,arc_c,BLZ_240312,kendall,somewhere_aggregate,5,8,0.39999999999999997,0.48333333333333334
|
591 |
+
aggregate,holistic,arc_c,BLZ_240312,kendall,somewhere_aggregate,5,9,0.7378647873726218,0.07697417298126676
|
592 |
+
aggregate,holistic,alpacav1,BLZ_240312,kendall,somewhere_aggregate,5,0,-0.31622776601683794,0.44848886103153174
|
593 |
+
aggregate,holistic,alpacav1,BLZ_240312,kendall,somewhere_aggregate,5,1,-0.7999999999999999,0.08333333333333333
|
594 |
+
aggregate,holistic,alpacav1,BLZ_240312,kendall,somewhere_aggregate,5,2,0.7999999999999999,0.08333333333333333
|
595 |
+
aggregate,holistic,alpacav1,BLZ_240312,kendall,somewhere_aggregate,5,3,0.7999999999999999,0.08333333333333333
|
596 |
+
aggregate,holistic,alpacav1,BLZ_240312,kendall,somewhere_aggregate,5,4,0.39999999999999997,0.48333333333333334
|
597 |
+
aggregate,holistic,alpacav1,BLZ_240312,kendall,somewhere_aggregate,5,5,0.39999999999999997,0.48333333333333334
|
598 |
+
aggregate,holistic,alpacav1,BLZ_240312,kendall,somewhere_aggregate,5,6,0.6,0.23333333333333334
|
599 |
+
aggregate,holistic,alpacav1,BLZ_240312,kendall,somewhere_aggregate,5,7,0.7378647873726218,0.07697417298126676
|
600 |
+
aggregate,holistic,alpacav1,BLZ_240312,kendall,somewhere_aggregate,5,8,0.6,0.23333333333333334
|
601 |
+
aggregate,holistic,alpacav1,BLZ_240312,kendall,somewhere_aggregate,5,9,0.31622776601683794,0.44848886103153174
|
602 |
+
aggregate,holistic,alpacav2,BLZ_240312,kendall,somewhere_aggregate,5,0,0.39999999999999997,0.48333333333333334
|
603 |
+
aggregate,holistic,alpacav2,BLZ_240312,kendall,somewhere_aggregate,5,1,0.19999999999999998,0.8166666666666667
|
604 |
+
aggregate,holistic,alpacav2,BLZ_240312,kendall,somewhere_aggregate,5,2,0.39999999999999997,0.48333333333333334
|
605 |
+
aggregate,holistic,alpacav2,BLZ_240312,kendall,somewhere_aggregate,5,3,0.7378647873726218,0.07697417298126676
|
606 |
+
aggregate,holistic,alpacav2,BLZ_240312,kendall,somewhere_aggregate,5,4,0.39999999999999997,0.48333333333333334
|
607 |
+
aggregate,holistic,alpacav2,BLZ_240312,kendall,somewhere_aggregate,5,5,-0.6,0.23333333333333334
|
608 |
+
aggregate,holistic,alpacav2,BLZ_240312,kendall,somewhere_aggregate,5,6,-0.39999999999999997,0.48333333333333334
|
609 |
+
aggregate,holistic,alpacav2,BLZ_240312,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
|
610 |
+
aggregate,holistic,alpacav2,BLZ_240312,kendall,somewhere_aggregate,5,8,0.0,1.0
|
611 |
+
aggregate,holistic,alpacav2,BLZ_240312,kendall,somewhere_aggregate,5,9,0.6,0.23333333333333334
|
612 |
+
aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,somewhere_aggregate,5,0,0.0,1.0
|
613 |
+
aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,somewhere_aggregate,5,1,-0.6,0.23333333333333334
|
614 |
+
aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,somewhere_aggregate,5,2,0.7999999999999999,0.08333333333333333
|
615 |
+
aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,somewhere_aggregate,5,3,0.0,1.0
|
616 |
+
aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,somewhere_aggregate,5,4,0.6,0.23333333333333334
|
617 |
+
aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,somewhere_aggregate,5,5,0.39999999999999997,0.48333333333333334
|
618 |
+
aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
|
619 |
+
aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
|
620 |
+
aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,somewhere_aggregate,5,8,-0.39999999999999997,0.48333333333333334
|
621 |
+
aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,somewhere_aggregate,5,9,0.0,1.0
|
622 |
+
aggregate,holistic,arena_elo,BLZ_240312,kendall,somewhere_aggregate,5,0,0.31622776601683794,0.44848886103153174
|
623 |
+
aggregate,holistic,arena_elo,BLZ_240312,kendall,somewhere_aggregate,5,1,0.6,0.23333333333333334
|
624 |
+
aggregate,holistic,arena_elo,BLZ_240312,kendall,somewhere_aggregate,5,2,0.31622776601683794,0.44848886103153174
|
625 |
+
aggregate,holistic,arena_elo,BLZ_240312,kendall,somewhere_aggregate,5,3,-0.9999999999999999,0.016666666666666666
|
626 |
+
aggregate,holistic,arena_elo,BLZ_240312,kendall,somewhere_aggregate,5,4,0.39999999999999997,0.48333333333333334
|
627 |
+
aggregate,holistic,arena_elo,BLZ_240312,kendall,somewhere_aggregate,5,5,0.31622776601683794,0.44848886103153174
|
628 |
+
aggregate,holistic,arena_elo,BLZ_240312,kendall,somewhere_aggregate,5,6,-0.9999999999999999,0.016666666666666666
|
629 |
+
aggregate,holistic,arena_elo,BLZ_240312,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
|
630 |
+
aggregate,holistic,arena_elo,BLZ_240312,kendall,somewhere_aggregate,5,8,-0.10540925533894598,0.8005421074231263
|
631 |
+
aggregate,holistic,arena_elo,BLZ_240312,kendall,somewhere_aggregate,5,9,0.6,0.23333333333333334
|
632 |
+
aggregate,holistic,bbh,BLZ_240312,kendall,somewhere_aggregate,5,0,0.9486832980505137,0.02297740150320607
|
633 |
+
aggregate,holistic,bbh,BLZ_240312,kendall,somewhere_aggregate,5,1,0.31622776601683794,0.44848886103153174
|
634 |
+
aggregate,holistic,bbh,BLZ_240312,kendall,somewhere_aggregate,5,2,-0.39999999999999997,0.48333333333333334
|
635 |
+
aggregate,holistic,bbh,BLZ_240312,kendall,somewhere_aggregate,5,3,0.9999999999999999,0.016666666666666666
|
636 |
+
aggregate,holistic,bbh,BLZ_240312,kendall,somewhere_aggregate,5,4,0.0,1.0
|
637 |
+
aggregate,holistic,bbh,BLZ_240312,kendall,somewhere_aggregate,5,5,-0.39999999999999997,0.48333333333333334
|
638 |
+
aggregate,holistic,bbh,BLZ_240312,kendall,somewhere_aggregate,5,6,0.7999999999999999,0.08333333333333333
|
639 |
+
aggregate,holistic,bbh,BLZ_240312,kendall,somewhere_aggregate,5,7,0.7999999999999999,0.08333333333333333
|
640 |
+
aggregate,holistic,bbh,BLZ_240312,kendall,somewhere_aggregate,5,8,0.5270462766947298,0.206507295485425
|
641 |
+
aggregate,holistic,bbh,BLZ_240312,kendall,somewhere_aggregate,5,9,0.5270462766947298,0.206507295485425
|
642 |
+
aggregate,holistic,eq_benchv2,BLZ_240312,kendall,somewhere_aggregate,5,0,-0.19999999999999998,0.8166666666666667
|
643 |
+
aggregate,holistic,eq_benchv2,BLZ_240312,kendall,somewhere_aggregate,5,1,0.19999999999999998,0.8166666666666667
|
644 |
+
aggregate,holistic,eq_benchv2,BLZ_240312,kendall,somewhere_aggregate,5,2,0.0,1.0
|
645 |
+
aggregate,holistic,eq_benchv2,BLZ_240312,kendall,somewhere_aggregate,5,3,0.0,1.0
|
646 |
+
aggregate,holistic,eq_benchv2,BLZ_240312,kendall,somewhere_aggregate,5,4,-0.39999999999999997,0.48333333333333334
|
647 |
+
aggregate,holistic,eq_benchv2,BLZ_240312,kendall,somewhere_aggregate,5,5,0.39999999999999997,0.48333333333333334
|
648 |
+
aggregate,holistic,eq_benchv2,BLZ_240312,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
|
649 |
+
aggregate,holistic,eq_benchv2,BLZ_240312,kendall,somewhere_aggregate,5,7,0.7999999999999999,0.08333333333333333
|
650 |
+
aggregate,holistic,eq_benchv2,BLZ_240312,kendall,somewhere_aggregate,5,8,0.19999999999999998,0.8166666666666667
|
651 |
+
aggregate,holistic,eq_benchv2,BLZ_240312,kendall,somewhere_aggregate,5,9,0.7999999999999999,0.08333333333333333
|
652 |
+
aggregate,holistic,gpt4all,BLZ_240312,kendall,somewhere_aggregate,5,0,0.7378647873726218,0.07697417298126676
|
653 |
+
aggregate,holistic,gpt4all,BLZ_240312,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
|
654 |
+
aggregate,holistic,gpt4all,BLZ_240312,kendall,somewhere_aggregate,5,2,0.31622776601683794,0.44848886103153174
|
655 |
+
aggregate,holistic,gpt4all,BLZ_240312,kendall,somewhere_aggregate,5,3,0.39999999999999997,0.48333333333333334
|
656 |
+
aggregate,holistic,gpt4all,BLZ_240312,kendall,somewhere_aggregate,5,4,0.31622776601683794,0.44848886103153174
|
657 |
+
aggregate,holistic,gpt4all,BLZ_240312,kendall,somewhere_aggregate,5,5,0.39999999999999997,0.48333333333333334
|
658 |
+
aggregate,holistic,gpt4all,BLZ_240312,kendall,somewhere_aggregate,5,6,-0.39999999999999997,0.48333333333333334
|
659 |
+
aggregate,holistic,gpt4all,BLZ_240312,kendall,somewhere_aggregate,5,7,-0.39999999999999997,0.48333333333333334
|
660 |
+
aggregate,holistic,gpt4all,BLZ_240312,kendall,somewhere_aggregate,5,8,0.39999999999999997,0.48333333333333334
|
661 |
+
aggregate,holistic,gpt4all,BLZ_240312,kendall,somewhere_aggregate,5,9,0.5270462766947298,0.206507295485425
|
662 |
+
aggregate,holistic,hugging_6,BLZ_240312,kendall,somewhere_aggregate,5,0,-0.6,0.23333333333333334
|
663 |
+
aggregate,holistic,hugging_6,BLZ_240312,kendall,somewhere_aggregate,5,1,-0.6,0.23333333333333334
|
664 |
+
aggregate,holistic,hugging_6,BLZ_240312,kendall,somewhere_aggregate,5,2,0.19999999999999998,0.8166666666666667
|
665 |
+
aggregate,holistic,hugging_6,BLZ_240312,kendall,somewhere_aggregate,5,3,0.39999999999999997,0.48333333333333334
|
666 |
+
aggregate,holistic,hugging_6,BLZ_240312,kendall,somewhere_aggregate,5,4,0.19999999999999998,0.8166666666666667
|
667 |
+
aggregate,holistic,hugging_6,BLZ_240312,kendall,somewhere_aggregate,5,5,0.6,0.23333333333333334
|
668 |
+
aggregate,holistic,hugging_6,BLZ_240312,kendall,somewhere_aggregate,5,6,-0.39999999999999997,0.48333333333333334
|
669 |
+
aggregate,holistic,hugging_6,BLZ_240312,kendall,somewhere_aggregate,5,7,-0.6,0.23333333333333334
|
670 |
+
aggregate,holistic,hugging_6,BLZ_240312,kendall,somewhere_aggregate,5,8,-0.39999999999999997,0.48333333333333334
|
671 |
+
aggregate,holistic,hugging_6,BLZ_240312,kendall,somewhere_aggregate,5,9,0.6,0.23333333333333334
|
672 |
+
aggregate,holistic,llmonitor,BLZ_240312,kendall,somewhere_aggregate,5,0,0.11952286093343936,0.7815112949987133
|
673 |
+
aggregate,holistic,llmonitor,BLZ_240312,kendall,somewhere_aggregate,5,1,0.0,1.0
|
674 |
+
aggregate,holistic,llmonitor,BLZ_240312,kendall,somewhere_aggregate,5,2,0.0,1.0
|
675 |
+
aggregate,holistic,llmonitor,BLZ_240312,kendall,somewhere_aggregate,5,3,0.6,0.23333333333333334
|
676 |
+
aggregate,holistic,llmonitor,BLZ_240312,kendall,somewhere_aggregate,5,4,0.39999999999999997,0.48333333333333334
|
677 |
+
aggregate,holistic,llmonitor,BLZ_240312,kendall,somewhere_aggregate,5,5,0.6,0.23333333333333334
|
678 |
+
aggregate,holistic,llmonitor,BLZ_240312,kendall,somewhere_aggregate,5,6,0.35856858280031806,0.40538055645894233
|
679 |
+
aggregate,holistic,llmonitor,BLZ_240312,kendall,somewhere_aggregate,5,7,0.31622776601683794,0.44848886103153174
|
680 |
+
aggregate,holistic,llmonitor,BLZ_240312,kendall,somewhere_aggregate,5,8,0.0,1.0
|
681 |
+
aggregate,holistic,llmonitor,BLZ_240312,kendall,somewhere_aggregate,5,9,0.5270462766947298,0.206507295485425
|
682 |
+
aggregate,holistic,magi,BLZ_240312,kendall,somewhere_aggregate,5,0,0.39999999999999997,0.48333333333333334
|
683 |
+
aggregate,holistic,magi,BLZ_240312,kendall,somewhere_aggregate,5,1,0.19999999999999998,0.8166666666666667
|
684 |
+
aggregate,holistic,magi,BLZ_240312,kendall,somewhere_aggregate,5,2,-0.10540925533894598,0.8005421074231263
|
685 |
+
aggregate,holistic,magi,BLZ_240312,kendall,somewhere_aggregate,5,3,0.39999999999999997,0.48333333333333334
|
686 |
+
aggregate,holistic,magi,BLZ_240312,kendall,somewhere_aggregate,5,4,0.6,0.23333333333333334
|
687 |
+
aggregate,holistic,magi,BLZ_240312,kendall,somewhere_aggregate,5,5,0.7999999999999999,0.08333333333333333
|
688 |
+
aggregate,holistic,magi,BLZ_240312,kendall,somewhere_aggregate,5,6,0.6,0.23333333333333334
|
689 |
+
aggregate,holistic,magi,BLZ_240312,kendall,somewhere_aggregate,5,7,0.7999999999999999,0.08333333333333333
|
690 |
+
aggregate,holistic,magi,BLZ_240312,kendall,somewhere_aggregate,5,8,0.19999999999999998,0.8166666666666667
|
691 |
+
aggregate,holistic,magi,BLZ_240312,kendall,somewhere_aggregate,5,9,-0.19999999999999998,0.8166666666666667
|
692 |
+
aggregate,holistic,mmlu,BLZ_240312,kendall,somewhere_aggregate,5,0,0.6,0.23333333333333334
|
693 |
+
aggregate,holistic,mmlu,BLZ_240312,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
|
694 |
+
aggregate,holistic,mmlu,BLZ_240312,kendall,somewhere_aggregate,5,2,0.0,1.0
|
695 |
+
aggregate,holistic,mmlu,BLZ_240312,kendall,somewhere_aggregate,5,3,0.39999999999999997,0.48333333333333334
|
696 |
+
aggregate,holistic,mmlu,BLZ_240312,kendall,somewhere_aggregate,5,4,0.6,0.23333333333333334
|
697 |
+
aggregate,holistic,mmlu,BLZ_240312,kendall,somewhere_aggregate,5,5,0.6,0.23333333333333334
|
698 |
+
aggregate,holistic,mmlu,BLZ_240312,kendall,somewhere_aggregate,5,6,0.6,0.23333333333333334
|
699 |
+
aggregate,holistic,mmlu,BLZ_240312,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
|
700 |
+
aggregate,holistic,mmlu,BLZ_240312,kendall,somewhere_aggregate,5,8,0.6,0.23333333333333334
|
701 |
+
aggregate,holistic,mmlu,BLZ_240312,kendall,somewhere_aggregate,5,9,0.5270462766947298,0.206507295485425
|
702 |
+
aggregate,holistic,mt_bench,BLZ_240312,kendall,somewhere_aggregate,5,0,0.6,0.23333333333333334
|
703 |
+
aggregate,holistic,mt_bench,BLZ_240312,kendall,somewhere_aggregate,5,1,0.0,1.0
|
704 |
+
aggregate,holistic,mt_bench,BLZ_240312,kendall,somewhere_aggregate,5,2,0.10540925533894598,0.8005421074231263
|
705 |
+
aggregate,holistic,mt_bench,BLZ_240312,kendall,somewhere_aggregate,5,3,0.39999999999999997,0.48333333333333334
|
706 |
+
aggregate,holistic,mt_bench,BLZ_240312,kendall,somewhere_aggregate,5,4,0.39999999999999997,0.48333333333333334
|
707 |
+
aggregate,holistic,mt_bench,BLZ_240312,kendall,somewhere_aggregate,5,5,0.19999999999999998,0.8166666666666667
|
708 |
+
aggregate,holistic,mt_bench,BLZ_240312,kendall,somewhere_aggregate,5,6,0.10540925533894598,0.8005421074231263
|
709 |
+
aggregate,holistic,mt_bench,BLZ_240312,kendall,somewhere_aggregate,5,7,-0.39999999999999997,0.48333333333333334
|
710 |
+
aggregate,holistic,mt_bench,BLZ_240312,kendall,somewhere_aggregate,5,8,0.19999999999999998,0.8166666666666667
|
711 |
+
aggregate,holistic,mt_bench,BLZ_240312,kendall,somewhere_aggregate,5,9,0.39999999999999997,0.48333333333333334
|
712 |
+
aggregate,holistic,biggen_mwr,biggen_240612,kendall,somewhere_aggregate,5,0,0.39999999999999997,0.48333333333333334
|
713 |
+
aggregate,holistic,biggen_mwr,biggen_240612,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
|
714 |
+
aggregate,holistic,biggen_mwr,biggen_240612,kendall,somewhere_aggregate,5,2,0.19999999999999998,0.8166666666666667
|
715 |
+
aggregate,holistic,biggen_mwr,biggen_240612,kendall,somewhere_aggregate,5,3,0.6,0.23333333333333334
|
716 |
+
aggregate,holistic,biggen_mwr,biggen_240612,kendall,somewhere_aggregate,5,4,0.6,0.23333333333333334
|
717 |
+
aggregate,holistic,biggen_mwr,biggen_240612,kendall,somewhere_aggregate,5,5,0.39999999999999997,0.48333333333333334
|
718 |
+
aggregate,holistic,biggen_mwr,biggen_240612,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
|
719 |
+
aggregate,holistic,biggen_mwr,biggen_240612,kendall,somewhere_aggregate,5,7,0.39999999999999997,0.48333333333333334
|
720 |
+
aggregate,holistic,biggen_mwr,biggen_240612,kendall,somewhere_aggregate,5,8,0.6,0.23333333333333334
|
721 |
+
aggregate,holistic,biggen_mwr,biggen_240612,kendall,somewhere_aggregate,5,9,0.7999999999999999,0.08333333333333333
|
cache/agreements_cache_facdc1028ee0edd9aed491afc51b884d.csv
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value
|
2 |
+
hellaswag,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.9999999999999999,0.016666666666666666
|
3 |
+
humaneval,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.31622776601683794,0.44848886103153174
|
4 |
+
mbpp,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.31622776601683794,0.44848886103153174
|
5 |
+
winogrande,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
|
6 |
+
grounding,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.7378647873726218,0.07697417298126676
|
7 |
+
instruction_following,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.7378647873726218,0.07697417298126676
|
8 |
+
planning,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
|
9 |
+
reasoning,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.9999999999999999,0.016666666666666666
|
10 |
+
refinement,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
|
11 |
+
safety,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
|
12 |
+
theory_of_mind,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
|
13 |
+
tool_usage,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
|
14 |
+
livebench_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
|
15 |
+
reasoning_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
|
16 |
+
coding_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
|
17 |
+
mathematics_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
|
18 |
+
data_analysis_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334
|
19 |
+
language_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
|
20 |
+
if_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
|
21 |
+
arena_hard,arena_hard_2404,aggregate,holistic,kendall,top_aggregate,5,0,0.9999999999999999,0.016666666666666666
|
22 |
+
mixeval,mixeval_240601,aggregate,holistic,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
|
23 |
+
agieval,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.5270462766947298,0.206507295485425
|
24 |
+
arc_c,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334
|
25 |
+
alpacav1,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.5270462766947298,0.206507295485425
|
26 |
+
alpacav2,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334
|
27 |
+
alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.31622776601683794,0.44848886103153174
|
28 |
+
arena_elo,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.9999999999999999,0.016666666666666666
|
29 |
+
bbh,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.5270462766947298,0.206507295485425
|
30 |
+
eq_benchv2,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.9999999999999999,0.016666666666666666
|
31 |
+
gpt4all,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.0,1.0
|
32 |
+
hugging_6,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
|
33 |
+
llmonitor,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334
|
34 |
+
magi,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.31622776601683794,0.44848886103153174
|
35 |
+
mmlu,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
|
36 |
+
mt_bench,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334
|
37 |
+
biggen_mwr,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
|
38 |
+
aggregate,holistic,hellaswag,BLZ_240312,kendall,top_aggregate,5,0,0.9999999999999999,0.016666666666666666
|
39 |
+
aggregate,holistic,humaneval,BLZ_240312,kendall,top_aggregate,5,0,0.31622776601683794,0.44848886103153174
|
40 |
+
aggregate,holistic,mbpp,BLZ_240312,kendall,top_aggregate,5,0,0.31622776601683794,0.44848886103153174
|
41 |
+
aggregate,holistic,winogrande,BLZ_240312,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
|
42 |
+
aggregate,holistic,grounding,biggen_240612,kendall,top_aggregate,5,0,0.7378647873726218,0.07697417298126676
|
43 |
+
aggregate,holistic,instruction_following,biggen_240612,kendall,top_aggregate,5,0,0.7378647873726218,0.07697417298126676
|
44 |
+
aggregate,holistic,planning,biggen_240612,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
|
45 |
+
aggregate,holistic,reasoning,biggen_240612,kendall,top_aggregate,5,0,0.9999999999999999,0.016666666666666666
|
46 |
+
aggregate,holistic,refinement,biggen_240612,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
|
47 |
+
aggregate,holistic,safety,biggen_240612,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
|
48 |
+
aggregate,holistic,theory_of_mind,biggen_240612,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
|
49 |
+
aggregate,holistic,tool_usage,biggen_240612,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
|
50 |
+
aggregate,holistic,livebench_average,livebench_240701,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
|
51 |
+
aggregate,holistic,reasoning_average,livebench_240701,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
|
52 |
+
aggregate,holistic,coding_average,livebench_240701,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
|
53 |
+
aggregate,holistic,mathematics_average,livebench_240701,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
|
54 |
+
aggregate,holistic,data_analysis_average,livebench_240701,kendall,top_aggregate,5,0,0.6,0.23333333333333334
|
55 |
+
aggregate,holistic,language_average,livebench_240701,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
|
56 |
+
aggregate,holistic,if_average,livebench_240701,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
|
57 |
+
aggregate,holistic,arena_hard,arena_hard_2404,kendall,top_aggregate,5,0,0.9999999999999999,0.016666666666666666
|
58 |
+
aggregate,holistic,mixeval,mixeval_240601,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
|
59 |
+
aggregate,holistic,agieval,BLZ_240312,kendall,top_aggregate,5,0,0.5270462766947298,0.206507295485425
|
60 |
+
aggregate,holistic,arc_c,BLZ_240312,kendall,top_aggregate,5,0,0.6,0.23333333333333334
|
61 |
+
aggregate,holistic,alpacav1,BLZ_240312,kendall,top_aggregate,5,0,0.5270462766947298,0.206507295485425
|
62 |
+
aggregate,holistic,alpacav2,BLZ_240312,kendall,top_aggregate,5,0,0.6,0.23333333333333334
|
63 |
+
aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,top_aggregate,5,0,0.31622776601683794,0.44848886103153174
|
64 |
+
aggregate,holistic,arena_elo,BLZ_240312,kendall,top_aggregate,5,0,0.9999999999999999,0.016666666666666666
|
65 |
+
aggregate,holistic,bbh,BLZ_240312,kendall,top_aggregate,5,0,0.5270462766947298,0.206507295485425
|
66 |
+
aggregate,holistic,eq_benchv2,BLZ_240312,kendall,top_aggregate,5,0,0.9999999999999999,0.016666666666666666
|
67 |
+
aggregate,holistic,gpt4all,BLZ_240312,kendall,top_aggregate,5,0,0.0,1.0
|
68 |
+
aggregate,holistic,hugging_6,BLZ_240312,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
|
69 |
+
aggregate,holistic,llmonitor,BLZ_240312,kendall,top_aggregate,5,0,0.6,0.23333333333333334
|
70 |
+
aggregate,holistic,magi,BLZ_240312,kendall,top_aggregate,5,0,0.31622776601683794,0.44848886103153174
|
71 |
+
aggregate,holistic,mmlu,BLZ_240312,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
|
72 |
+
aggregate,holistic,mt_bench,BLZ_240312,kendall,top_aggregate,5,0,0.6,0.23333333333333334
|
73 |
+
aggregate,holistic,biggen_mwr,biggen_240612,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
git+https://github.com/IBM/benchbench.git
|
2 |
+
streamlit
|
3 |
+
plotly
|