Fix metrics
app.py CHANGED
@@ -37,11 +37,17 @@ def get_leaderboard_df():
         data = json.load(file)
         first_result_key = next(iter(data["results"]))  # gets the first key in 'results'
         # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard
-        if task == "truthfulqa":
+        if task.lower() == "truthfulqa":
             value = data["results"][first_result_key]["truthfulqa_mc2"]
         # IFEval has several metrics but we report just the prompt-loose-acc one
-        elif task == "ifeval":
+        elif task.lower() == "ifeval":
             value = data["results"][first_result_key]["prompt_level_loose_acc"]
+        # MMLU has several metrics but we report just the average one
+        elif task.lower() == "mmlu":
+            value = data["results"]["lighteval|mmlu:_average|5"]["acc"]
+        # HellaSwag and ARC report acc_norm
+        elif task.lower() in ["hellaswag", "arc"]:
+            value = data["results"][first_result_key]["acc_norm"]
         else:
             first_metric_key = next(
                 iter(data["results"][first_result_key])
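For context, below is a minimal sketch of how the patched metric selection reads as a standalone function, with a small example payload. The helper name extract_task_value and the example result key are illustrative assumptions (not part of app.py), and the final fallback branch is assumed, since the diff above is truncated at the hunk boundary.

    def extract_task_value(data: dict, task: str) -> float:
        """Pick the leaderboard metric for one task from a lighteval-style results dict."""
        first_result_key = next(iter(data["results"]))  # first key in 'results'
        if task.lower() == "truthfulqa":
            # TruthfulQA reports two metrics; the leaderboard uses mc2
            return data["results"][first_result_key]["truthfulqa_mc2"]
        elif task.lower() == "ifeval":
            # IFEval reports several metrics; the leaderboard uses prompt-level loose accuracy
            return data["results"][first_result_key]["prompt_level_loose_acc"]
        elif task.lower() == "mmlu":
            # MMLU reports per-subject metrics; the leaderboard uses the 5-shot average
            return data["results"]["lighteval|mmlu:_average|5"]["acc"]
        elif task.lower() in ["hellaswag", "arc"]:
            # HellaSwag and ARC report normalized accuracy
            return data["results"][first_result_key]["acc_norm"]
        # Fallback is truncated in the diff above; assumed to take the first metric
        # reported under the first result key.
        first_metric_key = next(iter(data["results"][first_result_key]))
        return data["results"][first_result_key][first_metric_key]

    # Hypothetical results payload with the shape the code expects.
    example = {"results": {"lighteval|ifeval|0": {"prompt_level_loose_acc": 0.42}}}
    print(extract_task_value(example, "IFEval"))  # -> 0.42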