lewtun (HF staff) committed
Commit f7bb52c · Parent(s): 03f9140

Fix missing evals

Files changed (1): app.py (+9, -2)
app.py CHANGED

```diff
@@ -10,7 +10,7 @@ DESCRIPTION = f"""
 Evaluation of H4 and community models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.
 """
 
-BENCHMARKS_TO_SKIP = ["math", "mini_math", "aimo_math_integer_lvl4-5"]
+BENCHMARKS_TO_SKIP = ["math", "mini_math", "aimo_math_integer_lvl4-5", "mini_math_v2"]
 
 
 def get_leaderboard_df():
@@ -48,27 +48,34 @@ def get_leaderboard_df():
         # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard
         if task.lower() == "truthfulqa":
             value = data["results"][first_result_key]["truthfulqa_mc2"]
+            df.loc[model_revision, task] = float(value)
         # IFEval has several metrics but we report just the prompt-loose-acc one
         elif task.lower() == "ifeval":
             value = data["results"][first_result_key]["prompt_level_loose_acc"]
+            df.loc[model_revision, task] = float(value)
         # MMLU has several metrics but we report just the average one
         elif task.lower() == "mmlu":
             value = [v["acc"] for k, v in data["results"].items() if "_average" in k.lower()][0]
+            df.loc[model_revision, task] = float(value)
         # HellaSwag and ARC reports acc_norm
         elif task.lower() in ["hellaswag", "arc"]:
             value = data["results"][first_result_key]["acc_norm"]
+            df.loc[model_revision, task] = float(value)
         # BBH has several metrics but we report just the average one
         elif task.lower() == "bbh":
             if "all" in data["results"]:
                 value = data["results"]["all"]["acc"]
             else:
                 value = -100
+            df.loc[model_revision, task] = float(value)
         # AGIEval reports acc_norm
         elif task.lower() == "agieval":
             value = data["results"]["all"]["acc_norm"]
+            df.loc[model_revision, task] = float(value)
         # MATH reports qem
-        elif task.lower() in ["math", "math_v2", "aimo_kaggle", "math_deepseek_cot", "math_deepseek_rl_cot"]:
+        elif task.lower() in ["aimo_kaggle", "math_deepseek_cot", "math_deepseek_rl_cot"]:
            value = data["results"]["all"]["qem"]
+            df.loc[model_revision, task] = float(value)
         # For mini_math we report 5 metrics, one for each level and store each one as a separate row in the dataframe
         elif task.lower() in ["mini_math_v2"]:
             for k, v in data["results"].items():
```
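The pattern the fix settles on is simple: each benchmark branch picks its preferred metric from the parsed LightEval results JSON and writes it into the leaderboard table immediately, so no task's score is dropped on the way into the DataFrame. The snippet below is a minimal, self-contained sketch of that pattern, not the app's actual code: the `data` payload, the `model_revision` label, and the task list are placeholders, and the real `get_leaderboard_df()` iterates over many result files and covers more branches than shown here.

```python
import pandas as pd

# Hypothetical parsed LightEval results payload; real result files carry many more metrics.
data = {
    "results": {
        "leaderboard|truthfulqa:mc|0": {"truthfulqa_mc2": 0.472},
        "all": {"acc_norm": 0.611, "qem": 0.305, "acc": 0.544},
    }
}

df = pd.DataFrame()                      # rows: model revisions, columns: tasks
model_revision = "my-org/my-model_main"  # placeholder index label
first_result_key = next(k for k in data["results"] if k != "all")

for task in ["truthfulqa", "agieval", "aimo_kaggle"]:  # placeholder task list
    if task.lower() == "truthfulqa":
        # TruthfulQA: report the mc2 metric
        value = data["results"][first_result_key]["truthfulqa_mc2"]
        df.loc[model_revision, task] = float(value)
    elif task.lower() == "agieval":
        # AGIEval: report acc_norm
        value = data["results"]["all"]["acc_norm"]
        df.loc[model_revision, task] = float(value)
    elif task.lower() == "aimo_kaggle":
        # MATH-style tasks: report qem
        value = data["results"]["all"]["qem"]
        df.loc[model_revision, task] = float(value)

print(df)  # one row per model revision, one column per recorded task
```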