Fix MixEval
app.py CHANGED
@@ -36,75 +36,80 @@ def get_leaderboard_df():
 
         with open(filepath, "r") as file:
             data = json.load(file)
-            first_result_key = next(iter(data["results"]))  # gets the first key in 'results'
         # Skip benchmarks that we don't want to include in the leaderboard
         if task.lower() in BENCHMARKS_TO_SKIP:
             continue
-        # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard
-        if task.lower() == "truthfulqa":
-            value = data["results"][first_result_key]["truthfulqa_mc2"]
-        # IFEval has several metrics but we report just the prompt-loose-acc one
-        elif task.lower() == "ifeval":
-            value = data["results"][first_result_key]["prompt_level_loose_acc"]
-        # MMLU has several metrics but we report just the average one
-        elif task.lower() == "mmlu":
-            value = [v["acc"] for k, v in data["results"].items() if "_average" in k.lower()][0]
-        # HellaSwag and ARC report acc_norm
-        elif task.lower() in ["hellaswag", "arc"]:
-            value = data["results"][first_result_key]["acc_norm"]
-        # BBH has several metrics but we report just the average one
-        elif task.lower() == "bbh":
-            if "all" in data["results"]:
-                value = data["results"]["all"]["acc"]
-            else:
-                value = -100
-        # AGIEval reports acc_norm
-        elif task.lower() == "agieval":
-            value = data["results"]["all"]["acc_norm"]
-        # MATH reports qem
-        elif task.lower() in ["math", "math_v2", "aimo_kaggle", "math_deepseek_cot", "math_deepseek_rl_cot"]:
-            value = data["results"]["all"]["qem"]
-        # For mini_math we report 5 metrics, one for each level, and store each one as a separate row in the dataframe
-        elif task.lower() in ["mini_math_v2"]:
-            for k, v in data["results"].items():
-                if k != "all":
-                    level = k.split("|")[1].split(":")[-1]
-                    value = v["qem"]
-                    df.loc[model_revision, f"{task}_{level}"] = value
-        # For PoT we report N metrics, one for each prompt, and store each one as a separate row in the dataframe
-        elif task.lower() in ["aimo_kaggle_medium_pot", "aimo_kaggle_hard_pot"]:
-            for k, v in data["results"].items():
-                if k != "all" and "_average" not in k:
-                    version = k.split("|")[1].split(":")[-1]
-                    value = v["qem"] if "qem" in v else v["score"]
-                    df.loc[model_revision, f"{task}_{version}"] = value
-        # For kaggle_tora we report accuracy as a percentage, so we need to divide by 100
-        elif task.lower() in [
-            "aimo_tora_eval_kaggle_medium",
-            "aimo_tora_eval_kaggle_hard",
-            "aimo_kaggle_fast_eval_hard",
-            "aimo_kaggle_tora_medium",
-            "aimo_kaggle_tora_hard",
-            "aimo_kaggle_tora_medium_extended",
-            "aimo_kaggle_tora_hard_extended",
-            "aimo_math_integer_lvl4",
-            "aimo_math_integer_lvl5",
-        ]:
-            for k, v in data["results"].items():
-                value = float(v["qem"]) / 100.0
-                df.loc[model_revision, f"{task}"] = value
-        # For AlpacaEval we report the base win rate and the length-controlled one
-        elif task.lower() == "alpaca_eval":
-            value = data["results"][first_result_key]["win_rate"]
-            df.loc[model_revision, "Alpaca_eval"] = value / 100.0
-            value = data["results"][first_result_key]["length_controlled_winrate"]
-            df.loc[model_revision, "Alpaca_eval_lc"] = value / 100.0
+        # MixEval doesn't have a results key, so we need to get the overall score
+        if task.lower() in ["mixeval", "mixeval_hard"]:
+            value = data["overall score (final score)"]
+            df.loc[model_revision, f"{task}"] = value
         else:
-            first_metric_key = next(
-                iter(data["results"][first_result_key])
-            )  # gets the first key in the first result
-            value = data["results"][first_result_key][first_metric_key]  # gets the value of the first metric
-            df.loc[model_revision, task] = float(value)
+            first_result_key = next(iter(data["results"]))  # gets the first key in 'results'
+            # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard
+            if task.lower() == "truthfulqa":
+                value = data["results"][first_result_key]["truthfulqa_mc2"]
+            # IFEval has several metrics but we report just the prompt-loose-acc one
+            elif task.lower() == "ifeval":
+                value = data["results"][first_result_key]["prompt_level_loose_acc"]
+            # MMLU has several metrics but we report just the average one
+            elif task.lower() == "mmlu":
+                value = [v["acc"] for k, v in data["results"].items() if "_average" in k.lower()][0]
+            # HellaSwag and ARC report acc_norm
+            elif task.lower() in ["hellaswag", "arc"]:
+                value = data["results"][first_result_key]["acc_norm"]
+            # BBH has several metrics but we report just the average one
+            elif task.lower() == "bbh":
+                if "all" in data["results"]:
+                    value = data["results"]["all"]["acc"]
+                else:
+                    value = -100
+            # AGIEval reports acc_norm
+            elif task.lower() == "agieval":
+                value = data["results"]["all"]["acc_norm"]
+            # MATH reports qem
+            elif task.lower() in ["math", "math_v2", "aimo_kaggle", "math_deepseek_cot", "math_deepseek_rl_cot"]:
+                value = data["results"]["all"]["qem"]
+            # For mini_math we report 5 metrics, one for each level, and store each one as a separate row in the dataframe
+            elif task.lower() in ["mini_math_v2"]:
+                for k, v in data["results"].items():
+                    if k != "all":
+                        level = k.split("|")[1].split(":")[-1]
+                        value = v["qem"]
+                        df.loc[model_revision, f"{task}_{level}"] = value
+            # For PoT we report N metrics, one for each prompt, and store each one as a separate row in the dataframe
+            elif task.lower() in ["aimo_kaggle_medium_pot", "aimo_kaggle_hard_pot"]:
+                for k, v in data["results"].items():
+                    if k != "all" and "_average" not in k:
+                        version = k.split("|")[1].split(":")[-1]
+                        value = v["qem"] if "qem" in v else v["score"]
+                        df.loc[model_revision, f"{task}_{version}"] = value
+            # For kaggle_tora we report accuracy as a percentage, so we need to divide by 100
+            elif task.lower() in [
+                "aimo_tora_eval_kaggle_medium",
+                "aimo_tora_eval_kaggle_hard",
+                "aimo_kaggle_fast_eval_hard",
+                "aimo_kaggle_tora_medium",
+                "aimo_kaggle_tora_hard",
+                "aimo_kaggle_tora_medium_extended",
+                "aimo_kaggle_tora_hard_extended",
+                "aimo_math_integer_lvl4",
+                "aimo_math_integer_lvl5",
+            ]:
+                for k, v in data["results"].items():
+                    value = float(v["qem"]) / 100.0
+                    df.loc[model_revision, f"{task}"] = value
+            # For AlpacaEval we report the base win rate and the length-controlled one
+            elif task.lower() == "alpaca_eval":
+                value = data["results"][first_result_key]["win_rate"]
+                df.loc[model_revision, "Alpaca_eval"] = value / 100.0
+                value = data["results"][first_result_key]["length_controlled_winrate"]
+                df.loc[model_revision, "Alpaca_eval_lc"] = value / 100.0
+            else:
+                first_metric_key = next(
+                    iter(data["results"][first_result_key])
+                )  # gets the first key in the first result
+                value = data["results"][first_result_key][first_metric_key]  # gets the value of the first metric
+                df.loc[model_revision, task] = float(value)
 
     # Drop rows where every entry is NaN
     df = df.dropna(how="all", axis=0, subset=[c for c in df.columns if c != "Date"])
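For reference, here is a minimal sketch of the two report shapes the new branch distinguishes: MixEval runs store their score under a top-level "overall score (final score)" key and have no "results" dict, so the old unconditional next(iter(data["results"])) lookup would presumably fail on those files. Only those two key names and the task names "mixeval"/"mixeval_hard" come from the diff; the sample numbers and the nested result key below are hypothetical, and this is not the app's code.

# Reduced illustration of the branching added in this commit: MixEval vs. everything else.
mixeval_report = {"overall score (final score)": 42.3}            # hypothetical value
generic_report = {"results": {"custom|some_task|0": {"acc": 0.51}}}  # hypothetical keys/value


def extract_score(task: str, data: dict) -> float:
    if task.lower() in ["mixeval", "mixeval_hard"]:
        # MixEval output has no "results" key; the score sits at the top level.
        return float(data["overall score (final score)"])
    # Generic fallback: first metric of the first entry under "results".
    first_result_key = next(iter(data["results"]))
    first_metric_key = next(iter(data["results"][first_result_key]))
    return float(data["results"][first_result_key][first_metric_key])


print(extract_score("mixeval_hard", mixeval_report))  # 42.3
print(extract_score("some_task", generic_report))     # 0.51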