lewtun HF staff committed on
Commit
152be83
·
1 Parent(s): b16fbb9

Fix metrics

Browse files
Files changed (1) hide show
  1. app.py +8 -2
app.py CHANGED
@@ -37,11 +37,17 @@ def get_leaderboard_df():
37
  data = json.load(file)
38
  first_result_key = next(iter(data["results"])) # gets the first key in 'results'
39
  # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard
40
- if task == "truthfulqa":
41
  value = data["results"][first_result_key]["truthfulqa_mc2"]
42
  # IFEval has several metrics but we report just the prompt-loose-acc one
43
- elif task == "ifeval":
44
  value = data["results"][first_result_key]["prompt_level_loose_acc"]
 
 
 
 
 
 
45
  else:
46
  first_metric_key = next(
47
  iter(data["results"][first_result_key])
 
37
  data = json.load(file)
38
  first_result_key = next(iter(data["results"])) # gets the first key in 'results'
39
  # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard
40
+ if task.lower() == "truthfulqa":
41
  value = data["results"][first_result_key]["truthfulqa_mc2"]
42
  # IFEval has several metrics but we report just the prompt-loose-acc one
43
+ elif task.lower() == "ifeval":
44
  value = data["results"][first_result_key]["prompt_level_loose_acc"]
45
+ # MMLU has several metrics but we report just the average one
46
+ elif task.lower() == "mmlu":
47
+ value = data["results"]["lighteval|mmlu:_average|5"]["acc"]
48
+ # HellaSwag and ARC report acc_norm
49
+ elif task.lower() in ["hellaswag", "arc"]:
50
+ value = data["results"][first_result_key]["acc_norm"]
51
  else:
52
  first_metric_key = next(
53
  iter(data["results"][first_result_key])