Fix metrics
app.py CHANGED
@@ -37,11 +37,17 @@ def get_leaderboard_df():
         data = json.load(file)
         first_result_key = next(iter(data["results"]))  # gets the first key in 'results'
         # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard
-        if task == "truthfulqa":
+        if task.lower() == "truthfulqa":
             value = data["results"][first_result_key]["truthfulqa_mc2"]
         # IFEval has several metrics but we report just the prompt-loose-acc one
-        elif task == "ifeval":
+        elif task.lower() == "ifeval":
             value = data["results"][first_result_key]["prompt_level_loose_acc"]
+        # MMLU has several metrics but we report just the average one
+        elif task.lower() == "mmlu":
+            value = data["results"]["lighteval|mmlu:_average|5"]["acc"]
+        # HellaSwag and ARC report acc_norm
+        elif task.lower() in ["hellaswag", "arc"]:
+            value = data["results"][first_result_key]["acc_norm"]
         else:
             first_metric_key = next(
                 iter(data["results"][first_result_key])
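For context, below is a minimal sketch of how the patched metric selection reads as a standalone function, with a small example payload. The helper name extract_task_value and the example result key are illustrative assumptions (not part of app.py), and the final fallback branch is assumed, since the diff above is truncated at the hunk boundary.

    def extract_task_value(data: dict, task: str) -> float:
        """Pick the leaderboard metric for one task from a lighteval-style results dict."""
        first_result_key = next(iter(data["results"]))  # first key in 'results'
        if task.lower() == "truthfulqa":
            # TruthfulQA reports two metrics; the leaderboard uses mc2
            return data["results"][first_result_key]["truthfulqa_mc2"]
        elif task.lower() == "ifeval":
            # IFEval reports several metrics; the leaderboard uses prompt-level loose accuracy
            return data["results"][first_result_key]["prompt_level_loose_acc"]
        elif task.lower() == "mmlu":
            # MMLU reports per-subject metrics; the leaderboard uses the 5-shot average
            return data["results"]["lighteval|mmlu:_average|5"]["acc"]
        elif task.lower() in ["hellaswag", "arc"]:
            # HellaSwag and ARC report normalized accuracy
            return data["results"][first_result_key]["acc_norm"]
        # Fallback is truncated in the diff above; assumed to take the first metric
        # reported under the first result key.
        first_metric_key = next(iter(data["results"][first_result_key]))
        return data["results"][first_result_key][first_metric_key]

    # Hypothetical results payload with the shape the code expects.
    example = {"results": {"lighteval|ifeval|0": {"prompt_level_loose_acc": 0.42}}}
    print(extract_task_value(example, "IFEval"))  # -> 0.42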