Fix MATH
app.py CHANGED
@@ -10,7 +10,7 @@ DESCRIPTION = f"""
 Evaluation of H4 and community models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.
 """

-BENCHMARKS_TO_SKIP = ["math", "mini_math"]
+BENCHMARKS_TO_SKIP = ["math", "mini_math", "aimo_math_integer_lvl4-5"]


 def get_leaderboard_df(agg: str = "max"):
@@ -100,6 +100,8 @@ def get_leaderboard_df(agg: str = "max"):
         "aimo_kaggle_tora_hard",
         "aimo_kaggle_tora_medium_extended",
         "aimo_kaggle_tora_hard_extended",
+        "aimo_math_integer_lvl4",
+        "aimo_math_integer_lvl5",
     ]:
         for k, v in data["results"].items():
             value = float(v["qem"]) / 100.0
@@ -122,7 +124,7 @@ def get_leaderboard_df(agg: str = "max"):
     # Trim AIMO column names
     df.columns = [c.replace("aimo_", "") for c in df.columns]

-    df = df.reset_index().rename(columns={"index": "Model"}).round(
+    df = df.reset_index().rename(columns={"index": "Model"}).round(4)
     # Strip off date from model name
     df["Model"] = df["Model"].apply(lambda x: x.rsplit("_", 1)[0])

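Taken together, the diff drops the superseded MATH variants from the leaderboard and wires the two new integer-answer subsets (lvl4 and lvl5) through the same qem-to-fraction scoring path. Below is a minimal sketch of how those pieces plausibly interact; the raw_results payload is invented for illustration and the standalone pandas post-processing is an approximation, not the Space's actual get_leaderboard_df.

import pandas as pd

BENCHMARKS_TO_SKIP = ["math", "mini_math", "aimo_math_integer_lvl4-5"]

# Invented example payload; the real app reads per-model JSON result files.
raw_results = {
    "model-a_2024-06-01": {"aimo_kaggle_tora_hard": 42.5, "aimo_math_integer_lvl4": 18.0, "math": 31.0},
    "model-b_2024-06-02": {"aimo_kaggle_tora_hard": 55.0, "aimo_math_integer_lvl4": 22.5, "math": 29.5},
}

rows = {}
for model, benchmarks in raw_results.items():
    rows[model] = {
        name: score / 100.0  # qem scores arrive as percentages
        for name, score in benchmarks.items()
        if name not in BENCHMARKS_TO_SKIP  # drop the skipped MATH variants
    }

df = pd.DataFrame.from_dict(rows, orient="index")
# Same post-processing as the third hunk: trim the "aimo_" prefix,
# promote the index to a "Model" column, round to 4 decimals, and
# strip the trailing date from the model name.
df.columns = [c.replace("aimo_", "") for c in df.columns]
df = df.reset_index().rename(columns={"index": "Model"}).round(4)
df["Model"] = df["Model"].apply(lambda x: x.rsplit("_", 1)[0])
print(df)

Note that BENCHMARKS_TO_SKIP presumably matches the raw benchmark names, i.e. before the "aimo_" prefix is trimmed, which is why the new entry is "aimo_math_integer_lvl4-5" while the newly scored lvl4 and lvl5 subsets pass through.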