Fix MATH
app.py CHANGED
@@ -10,7 +10,7 @@ DESCRIPTION = f"""
 Evaluation of H4 and community models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.
 """

-BENCHMARKS_TO_SKIP = ["math", "mini_math"]
+BENCHMARKS_TO_SKIP = ["math", "mini_math", "aimo_math_integer_lvl4-5"]


 def get_leaderboard_df(agg: str = "max"):
@@ -100,6 +100,8 @@ def get_leaderboard_df(agg: str = "max"):
         "aimo_kaggle_tora_hard",
         "aimo_kaggle_tora_medium_extended",
         "aimo_kaggle_tora_hard_extended",
+        "aimo_math_integer_lvl4",
+        "aimo_math_integer_lvl5",
     ]:
         for k, v in data["results"].items():
             value = float(v["qem"]) / 100.0
@@ -122,7 +124,7 @@ def get_leaderboard_df(agg: str = "max"):
     # Trim AIMO column names
     df.columns = [c.replace("aimo_", "") for c in df.columns]

-    df = df.reset_index().rename(columns={"index": "Model"}).round(
+    df = df.reset_index().rename(columns={"index": "Model"}).round(4)
     # Strip off date from model name
     df["Model"] = df["Model"].apply(lambda x: x.rsplit("_", 1)[0])

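Taken together, the diff drops the superseded MATH variants from the leaderboard and wires the two new integer-answer subsets (lvl4 and lvl5) through the same qem-to-fraction scoring path. Below is a minimal sketch of how those pieces plausibly interact; the raw_results payload is invented for illustration and the standalone pandas post-processing is an approximation, not the Space's actual get_leaderboard_df.

import pandas as pd

BENCHMARKS_TO_SKIP = ["math", "mini_math", "aimo_math_integer_lvl4-5"]

# Invented example payload; the real app reads per-model JSON result files.
raw_results = {
    "model-a_2024-06-01": {"aimo_kaggle_tora_hard": 42.5, "aimo_math_integer_lvl4": 18.0, "math": 31.0},
    "model-b_2024-06-02": {"aimo_kaggle_tora_hard": 55.0, "aimo_math_integer_lvl4": 22.5, "math": 29.5},
}

rows = {}
for model, benchmarks in raw_results.items():
    rows[model] = {
        name: score / 100.0  # qem scores arrive as percentages
        for name, score in benchmarks.items()
        if name not in BENCHMARKS_TO_SKIP  # drop the skipped MATH variants
    }

df = pd.DataFrame.from_dict(rows, orient="index")
# Same post-processing as the third hunk: trim the "aimo_" prefix,
# promote the index to a "Model" column, round to 4 decimals, and
# strip the trailing date from the model name.
df.columns = [c.replace("aimo_", "") for c in df.columns]
df = df.reset_index().rename(columns={"index": "Model"}).round(4)
df["Model"] = df["Model"].apply(lambda x: x.rsplit("_", 1)[0])
print(df)

Note that BENCHMARKS_TO_SKIP presumably matches the raw benchmark names, i.e. before the "aimo_" prefix is trimmed, which is why the new entry is "aimo_math_integer_lvl4-5" while the newly scored lvl4 and lvl5 subsets pass through.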