lewtun committed
Commit a17f702 · 1 Parent(s): c809f5c

Fix MixEval

Files changed (1)
  1. app.py +70 -65
app.py CHANGED
@@ -36,75 +36,80 @@ def get_leaderboard_df():
 
         with open(filepath, "r") as file:
             data = json.load(file)
-        first_result_key = next(iter(data["results"]))  # gets the first key in 'results'
         # Skip benchmarks that we don't want to include in the leaderboard
         if task.lower() in BENCHMARKS_TO_SKIP:
             continue
-        # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard
-        if task.lower() == "truthfulqa":
-            value = data["results"][first_result_key]["truthfulqa_mc2"]
-        # IFEval has several metrics but we report just the prompt-loose-acc one
-        elif task.lower() == "ifeval":
-            value = data["results"][first_result_key]["prompt_level_loose_acc"]
-        # MMLU has several metrics but we report just the average one
-        elif task.lower() == "mmlu":
-            value = [v["acc"] for k, v in data["results"].items() if "_average" in k.lower()][0]
-        # HellaSwag and ARC report acc_norm
-        elif task.lower() in ["hellaswag", "arc"]:
-            value = data["results"][first_result_key]["acc_norm"]
-        # BBH has several metrics but we report just the average one
-        elif task.lower() == "bbh":
-            if "all" in data["results"]:
-                value = data["results"]["all"]["acc"]
-            else:
-                value = -100
-        # AGIEval reports acc_norm
-        elif task.lower() == "agieval":
-            value = data["results"]["all"]["acc_norm"]
-        # MATH reports qem
-        elif task.lower() in ["math", "math_v2", "aimo_kaggle", "math_deepseek_cot", "math_deepseek_rl_cot"]:
-            value = data["results"]["all"]["qem"]
-        # For mini_math we report 5 metrics, one for each level, and store each one as a separate row in the dataframe
-        elif task.lower() in ["mini_math_v2"]:
-            for k, v in data["results"].items():
-                if k != "all":
-                    level = k.split("|")[1].split(":")[-1]
-                    value = v["qem"]
-                    df.loc[model_revision, f"{task}_{level}"] = value
-        # For PoT we report N metrics, one for each prompt, and store each one as a separate row in the dataframe
-        elif task.lower() in ["aimo_kaggle_medium_pot", "aimo_kaggle_hard_pot"]:
-            for k, v in data["results"].items():
-                if k != "all" and "_average" not in k:
-                    version = k.split("|")[1].split(":")[-1]
-                    value = v["qem"] if "qem" in v else v["score"]
-                    df.loc[model_revision, f"{task}_{version}"] = value
-        # For kaggle_tora we report accuracy as a percentage, so we need to divide by 100
-        elif task.lower() in [
-            "aimo_tora_eval_kaggle_medium",
-            "aimo_tora_eval_kaggle_hard",
-            "aimo_kaggle_fast_eval_hard",
-            "aimo_kaggle_tora_medium",
-            "aimo_kaggle_tora_hard",
-            "aimo_kaggle_tora_medium_extended",
-            "aimo_kaggle_tora_hard_extended",
-            "aimo_math_integer_lvl4",
-            "aimo_math_integer_lvl5",
-        ]:
-            for k, v in data["results"].items():
-                value = float(v["qem"]) / 100.0
-                df.loc[model_revision, f"{task}"] = value
-        # For AlpacaEval we report the base win rate and the length-corrected one
-        elif task.lower() == "alpaca_eval":
-            value = data["results"][first_result_key]["win_rate"]
-            df.loc[model_revision, "Alpaca_eval"] = value / 100.0
-            value = data["results"][first_result_key]["length_controlled_winrate"]
-            df.loc[model_revision, "Alpaca_eval_lc"] = value / 100.0
+        # MixEval doesn't have a results key, so we need to get the overall score
+        if task.lower() in ["mixeval", "mixeval_hard"]:
+            value = data["overall score (final score)"]
+            df.loc[model_revision, f"{task}"] = value
         else:
-            first_metric_key = next(
-                iter(data["results"][first_result_key])
-            )  # gets the first key in the first result
-            value = data["results"][first_result_key][first_metric_key]  # gets the value of the first metric
-            df.loc[model_revision, task] = float(value)
+            first_result_key = next(iter(data["results"]))  # gets the first key in 'results'
+            # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard
+            if task.lower() == "truthfulqa":
+                value = data["results"][first_result_key]["truthfulqa_mc2"]
+            # IFEval has several metrics but we report just the prompt-loose-acc one
+            elif task.lower() == "ifeval":
+                value = data["results"][first_result_key]["prompt_level_loose_acc"]
+            # MMLU has several metrics but we report just the average one
+            elif task.lower() == "mmlu":
+                value = [v["acc"] for k, v in data["results"].items() if "_average" in k.lower()][0]
+            # HellaSwag and ARC report acc_norm
+            elif task.lower() in ["hellaswag", "arc"]:
+                value = data["results"][first_result_key]["acc_norm"]
+            # BBH has several metrics but we report just the average one
+            elif task.lower() == "bbh":
+                if "all" in data["results"]:
+                    value = data["results"]["all"]["acc"]
+                else:
+                    value = -100
+            # AGIEval reports acc_norm
+            elif task.lower() == "agieval":
+                value = data["results"]["all"]["acc_norm"]
+            # MATH reports qem
+            elif task.lower() in ["math", "math_v2", "aimo_kaggle", "math_deepseek_cot", "math_deepseek_rl_cot"]:
+                value = data["results"]["all"]["qem"]
+            # For mini_math we report 5 metrics, one for each level, and store each one as a separate row in the dataframe
+            elif task.lower() in ["mini_math_v2"]:
+                for k, v in data["results"].items():
+                    if k != "all":
+                        level = k.split("|")[1].split(":")[-1]
+                        value = v["qem"]
+                        df.loc[model_revision, f"{task}_{level}"] = value
+            # For PoT we report N metrics, one for each prompt, and store each one as a separate row in the dataframe
+            elif task.lower() in ["aimo_kaggle_medium_pot", "aimo_kaggle_hard_pot"]:
+                for k, v in data["results"].items():
+                    if k != "all" and "_average" not in k:
+                        version = k.split("|")[1].split(":")[-1]
+                        value = v["qem"] if "qem" in v else v["score"]
+                        df.loc[model_revision, f"{task}_{version}"] = value
+            # For kaggle_tora we report accuracy as a percentage, so we need to divide by 100
+            elif task.lower() in [
+                "aimo_tora_eval_kaggle_medium",
+                "aimo_tora_eval_kaggle_hard",
+                "aimo_kaggle_fast_eval_hard",
+                "aimo_kaggle_tora_medium",
+                "aimo_kaggle_tora_hard",
+                "aimo_kaggle_tora_medium_extended",
+                "aimo_kaggle_tora_hard_extended",
+                "aimo_math_integer_lvl4",
+                "aimo_math_integer_lvl5",
+            ]:
+                for k, v in data["results"].items():
+                    value = float(v["qem"]) / 100.0
+                    df.loc[model_revision, f"{task}"] = value
+            # For AlpacaEval we report the base win rate and the length-corrected one
+            elif task.lower() == "alpaca_eval":
+                value = data["results"][first_result_key]["win_rate"]
+                df.loc[model_revision, "Alpaca_eval"] = value / 100.0
+                value = data["results"][first_result_key]["length_controlled_winrate"]
+                df.loc[model_revision, "Alpaca_eval_lc"] = value / 100.0
+            else:
+                first_metric_key = next(
+                    iter(data["results"][first_result_key])
+                )  # gets the first key in the first result
+                value = data["results"][first_result_key][first_metric_key]  # gets the value of the first metric
+                df.loc[model_revision, task] = float(value)
 
     # Drop rows where every entry is NaN
     df = df.dropna(how="all", axis=0, subset=[c for c in df.columns if c != "Date"])
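
The substance of the fix: `first_result_key = next(iter(data["results"]))` used to run before the task branch, so a MixEval result file, which has no top-level "results" key, would raise a KeyError. The new code special-cases `mixeval`/`mixeval_hard`, reads the flat "overall score (final score)" field directly, and only touches `data["results"]` for the other benchmarks. Below is a minimal sketch of that control flow under simplified assumptions: the sample payloads, model label, and task pairs are hypothetical, and only the "overall score (final score)" and "results" key names come from the commit.

import pandas as pd

# Hypothetical payloads for illustration; only the key names
# "overall score (final score)" and "results" come from the commit.
mixeval_data = {"overall score (final score)": 0.42}            # MixEval: no "results" key
lighteval_data = {"results": {"custom|demo_task|5": {"acc": 0.71}}}  # lighteval-style file

df = pd.DataFrame()
model_revision = "my-model_main"  # hypothetical row label

for task, data in [("mixeval_hard", mixeval_data), ("demo_task", lighteval_data)]:
    if task.lower() in ["mixeval", "mixeval_hard"]:
        # MixEval stores a single flat score, so read it directly
        df.loc[model_revision, task] = data["overall score (final score)"]
    else:
        # Only benchmarks with a "results" section go through the generic fallback path
        first_result_key = next(iter(data["results"]))
        first_metric_key = next(iter(data["results"][first_result_key]))
        df.loc[model_revision, task] = float(data["results"][first_result_key][first_metric_key])

print(df)

Keeping the `first_result_key` lookup inside the `else` branch means files without a "results" section never reach the generic path, which is the behavior the commit introduces.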