Fix MixEval
app.py CHANGED
@@ -36,75 +36,80 @@ def get_leaderboard_df():
 
         with open(filepath, "r") as file:
             data = json.load(file)
-            first_result_key = next(iter(data["results"]))  # gets the first key in 'results'
         # Skip benchmarks that we don't want to include in the leaderboard
         if task.lower() in BENCHMARKS_TO_SKIP:
             continue
-        # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard
-        if task.lower() == "truthfulqa":
-            value = data["results"][first_result_key]["truthfulqa_mc2"]
-        # IFEval has several metrics but we report just the prompt-loose-acc one
-        elif task.lower() == "ifeval":
-            value = data["results"][first_result_key]["prompt_level_loose_acc"]
-        # MMLU has several metrics but we report just the average one
-        elif task.lower() == "mmlu":
-            value = [v["acc"] for k, v in data["results"].items() if "_average" in k.lower()][0]
-        # HellaSwag and ARC report acc_norm
-        elif task.lower() in ["hellaswag", "arc"]:
-            value = data["results"][first_result_key]["acc_norm"]
-        # BBH has several metrics but we report just the average one
-        elif task.lower() == "bbh":
-            if "all" in data["results"]:
-                value = data["results"]["all"]["acc"]
-            else:
-                value = -100
-        # AGIEval reports acc_norm
-        elif task.lower() == "agieval":
-            value = data["results"]["all"]["acc_norm"]
-        # MATH reports qem
-        elif task.lower() in ["math", "math_v2", "aimo_kaggle", "math_deepseek_cot", "math_deepseek_rl_cot"]:
-            value = data["results"]["all"]["qem"]
-        # For mini_math we report 5 metrics, one for each level, and store each one as a separate row in the dataframe
-        elif task.lower() in ["mini_math_v2"]:
-            for k, v in data["results"].items():
-                if k != "all":
-                    level = k.split("|")[1].split(":")[-1]
-                    value = v["qem"]
-                    df.loc[model_revision, f"{task}_{level}"] = value
-        # For PoT we report N metrics, one for each prompt, and store each one as a separate row in the dataframe
-        elif task.lower() in ["aimo_kaggle_medium_pot", "aimo_kaggle_hard_pot"]:
-            for k, v in data["results"].items():
-                if k != "all" and "_average" not in k:
-                    version = k.split("|")[1].split(":")[-1]
-                    value = v["qem"] if "qem" in v else v["score"]
-                    df.loc[model_revision, f"{task}_{version}"] = value
-        # For kaggle_tora we report accuracy as a percentage, so we need to divide by 100
-        elif task.lower() in [
-            "aimo_tora_eval_kaggle_medium",
-            "aimo_tora_eval_kaggle_hard",
-            "aimo_kaggle_fast_eval_hard",
-            "aimo_kaggle_tora_medium",
-            "aimo_kaggle_tora_hard",
-            "aimo_kaggle_tora_medium_extended",
-            "aimo_kaggle_tora_hard_extended",
-            "aimo_math_integer_lvl4",
-            "aimo_math_integer_lvl5",
-        ]:
-            for k, v in data["results"].items():
-                value = float(v["qem"]) / 100.0
-                df.loc[model_revision, f"{task}"] = value
-        # For AlpacaEval we report the base win rate and the length-controlled one
-        elif task.lower() == "alpaca_eval":
-            value = data["results"][first_result_key]["win_rate"]
-            df.loc[model_revision, "Alpaca_eval"] = value / 100.0
-            value = data["results"][first_result_key]["length_controlled_winrate"]
-            df.loc[model_revision, "Alpaca_eval_lc"] = value / 100.0
+        # MixEval doesn't have a results key, so we need to get the overall score
+        if task.lower() in ["mixeval", "mixeval_hard"]:
+            value = data["overall score (final score)"]
+            df.loc[model_revision, f"{task}"] = value
         else:
-            first_metric_key = next(
-                iter(data["results"][first_result_key])
-            )  # gets the first key in the first result
-            value = data["results"][first_result_key][first_metric_key]  # gets the value of the first metric
-            df.loc[model_revision, task] = float(value)
+            first_result_key = next(iter(data["results"]))  # gets the first key in 'results'
+            # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard
+            if task.lower() == "truthfulqa":
+                value = data["results"][first_result_key]["truthfulqa_mc2"]
+            # IFEval has several metrics but we report just the prompt-loose-acc one
+            elif task.lower() == "ifeval":
+                value = data["results"][first_result_key]["prompt_level_loose_acc"]
+            # MMLU has several metrics but we report just the average one
+            elif task.lower() == "mmlu":
+                value = [v["acc"] for k, v in data["results"].items() if "_average" in k.lower()][0]
+            # HellaSwag and ARC report acc_norm
+            elif task.lower() in ["hellaswag", "arc"]:
+                value = data["results"][first_result_key]["acc_norm"]
+            # BBH has several metrics but we report just the average one
+            elif task.lower() == "bbh":
+                if "all" in data["results"]:
+                    value = data["results"]["all"]["acc"]
+                else:
+                    value = -100
+            # AGIEval reports acc_norm
+            elif task.lower() == "agieval":
+                value = data["results"]["all"]["acc_norm"]
+            # MATH reports qem
+            elif task.lower() in ["math", "math_v2", "aimo_kaggle", "math_deepseek_cot", "math_deepseek_rl_cot"]:
+                value = data["results"]["all"]["qem"]
+            # For mini_math we report 5 metrics, one for each level, and store each one as a separate row in the dataframe
+            elif task.lower() in ["mini_math_v2"]:
+                for k, v in data["results"].items():
+                    if k != "all":
+                        level = k.split("|")[1].split(":")[-1]
+                        value = v["qem"]
+                        df.loc[model_revision, f"{task}_{level}"] = value
+            # For PoT we report N metrics, one for each prompt, and store each one as a separate row in the dataframe
+            elif task.lower() in ["aimo_kaggle_medium_pot", "aimo_kaggle_hard_pot"]:
+                for k, v in data["results"].items():
+                    if k != "all" and "_average" not in k:
+                        version = k.split("|")[1].split(":")[-1]
+                        value = v["qem"] if "qem" in v else v["score"]
+                        df.loc[model_revision, f"{task}_{version}"] = value
+            # For kaggle_tora we report accuracy as a percentage, so we need to divide by 100
+            elif task.lower() in [
+                "aimo_tora_eval_kaggle_medium",
+                "aimo_tora_eval_kaggle_hard",
+                "aimo_kaggle_fast_eval_hard",
+                "aimo_kaggle_tora_medium",
+                "aimo_kaggle_tora_hard",
+                "aimo_kaggle_tora_medium_extended",
+                "aimo_kaggle_tora_hard_extended",
+                "aimo_math_integer_lvl4",
+                "aimo_math_integer_lvl5",
+            ]:
+                for k, v in data["results"].items():
+                    value = float(v["qem"]) / 100.0
+                    df.loc[model_revision, f"{task}"] = value
+            # For AlpacaEval we report the base win rate and the length-controlled one
+            elif task.lower() == "alpaca_eval":
+                value = data["results"][first_result_key]["win_rate"]
+                df.loc[model_revision, "Alpaca_eval"] = value / 100.0
+                value = data["results"][first_result_key]["length_controlled_winrate"]
+                df.loc[model_revision, "Alpaca_eval_lc"] = value / 100.0
+            else:
+                first_metric_key = next(
+                    iter(data["results"][first_result_key])
+                )  # gets the first key in the first result
+                value = data["results"][first_result_key][first_metric_key]  # gets the value of the first metric
+                df.loc[model_revision, task] = float(value)
 
     # Drop rows where every entry is NaN
     df = df.dropna(how="all", axis=0, subset=[c for c in df.columns if c != "Date"])
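For reference, here is a minimal sketch of the two report shapes the new branch distinguishes: MixEval runs store their score under a top-level "overall score (final score)" key and have no "results" dict, so the old unconditional next(iter(data["results"])) lookup would presumably fail on those files. Only those two key names and the task names "mixeval"/"mixeval_hard" come from the diff; the sample numbers and the nested result key below are hypothetical, and this is not the app's code.

# Reduced illustration of the branching added in this commit: MixEval vs. everything else.
mixeval_report = {"overall score (final score)": 42.3}            # hypothetical value
generic_report = {"results": {"custom|some_task|0": {"acc": 0.51}}}  # hypothetical keys/value


def extract_score(task: str, data: dict) -> float:
    if task.lower() in ["mixeval", "mixeval_hard"]:
        # MixEval output has no "results" key; the score sits at the top level.
        return float(data["overall score (final score)"])
    # Generic fallback: first metric of the first entry under "results".
    first_result_key = next(iter(data["results"]))
    first_metric_key = next(iter(data["results"][first_result_key]))
    return float(data["results"][first_result_key][first_metric_key])


print(extract_score("mixeval_hard", mixeval_report))  # 42.3
print(extract_score("some_task", generic_report))     # 0.51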