import json
from pathlib import Path
import gradio as gr
import pandas as pd
TITLE = """<h1 align="center" id="space-title">LLM Leaderboard for open-r1 Models</h1>"""
DESCRIPTION = f"""
Evaluation of open-r1 models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.
"""
BENCHMARKS_TO_SKIP = ["math", "mini_math", "aimo_math_integer_lvl4-5", "mini_math_v2"]
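# NOTE (assumed layout): result files are expected to live roughly under
# eval_results/<org>/<model>/<revision>/<task>/*_<timestamp>.json, since
# path_parts[1:4] below is treated as the model identifier, path_parts[4] as
# the task name, and the tail of the filename stem as the date.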
def get_leaderboard_df():
filepaths = list(Path("eval_results").rglob("*.json"))
# Parse filepaths to get unique models
models = set()
for filepath in filepaths:
path_parts = Path(filepath).parts
model_revision = "_".join(path_parts[1:4])
models.add(model_revision)
# Initialize DataFrame
df = pd.DataFrame(index=list(models))
# Extract data from each file and populate the DataFrame
for filepath in filepaths:
path_parts = Path(filepath).parts
date = filepath.stem.split("_")[-1][:-3]
model_revision = "_".join(path_parts[1:4]) + "_" + date
task = path_parts[4]
df.loc[model_revision, "Date"] = date
with open(filepath, "r") as file:
data = json.load(file)
# Skip benchmarks that we don't want to include in the leaderboard
if task.lower() in BENCHMARKS_TO_SKIP:
continue
# MixEval doesn't have a results key, so we need to get the overall score
if task.lower() in ["mixeval", "mixeval_hard"]:
value = data["overall score (final score)"]
df.loc[model_revision, f"{task}"] = value
else:
first_result_key = next(iter(data["results"])) # gets the first key in 'results'
# TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard
if task.lower() == "truthfulqa":
value = data["results"][first_result_key]["truthfulqa_mc2"]
df.loc[model_revision, task] = float(value)
# IFEval has several metrics, but we report their average, as in the Llama 3 paper
elif task.lower() == "ifeval":
values = 0.0
for metric in [
"prompt_level_loose",
"prompt_level_strict",
"inst_level_strict",
"inst_level_loose",
]:
values += data["results"][first_result_key][f"{metric}_acc"]
value = values / 4
df.loc[model_revision, f"{task}"] = float(value)
# MMLU has several metrics but we report just the average one
elif task.lower() == "mmlu":
value = [v["acc"] for k, v in data["results"].items() if "_average" in k.lower()][0]
df.loc[model_revision, task] = float(value)
# HellaSwag and ARC report acc_norm
elif task.lower() in ["hellaswag", "arc"]:
value = data["results"][first_result_key]["acc_norm"]
df.loc[model_revision, task] = float(value)
# BBH has several metrics but we report just the average one
elif task.lower() == "bbh":
if "all" in data["results"]:
value = data["results"]["all"]["acc"]
else:
value = -100
df.loc[model_revision, task] = float(value)
# AGIEval reports acc_norm
elif task.lower() == "agieval":
value = data["results"]["all"]["acc_norm"]
df.loc[model_revision, task] = float(value)
# MATH-style benchmarks report qem (quasi-exact match)
elif task.lower() in ["aimo_kaggle", "math_deepseek_cot", "math_deepseek_rl_cot"]:
value = data["results"]["all"]["qem"]
df.loc[model_revision, task] = float(value)
# For mini_math we report 5 metrics, one per level, and store each one as a separate column in the dataframe
elif task.lower() in ["mini_math_v2"]:
for k, v in data["results"].items():
if k != "all":
level = k.split("|")[1].split(":")[-1]
value = v["qem"]
df.loc[model_revision, f"{task}_{level}"] = value
# For PoT we report N metrics, one per prompt version, and store each one as a separate column in the dataframe
elif task.lower() in ["aimo_kaggle_medium_pot", "aimo_kaggle_hard_pot"]:
for k, v in data["results"].items():
if k != "all" and "_average" not in k:
version = k.split("|")[1].split(":")[-1]
value = v["qem"] if "qem" in v else v["score"]
df.loc[model_revision, f"{task}_{version}"] = value
# kaggle_tora scores are stored as percentages, so we divide by 100 to get a fraction
elif task.lower() in [
"aimo_tora_eval_kaggle_medium",
"aimo_tora_eval_kaggle_hard",
"aimo_kaggle_fast_eval_hard",
"aimo_kaggle_tora_medium",
"aimo_kaggle_tora_hard",
"aimo_kaggle_tora_medium_extended",
"aimo_kaggle_tora_hard_extended",
"aimo_math_integer_lvl4",
"aimo_math_integer_lvl5",
]:
for k, v in data["results"].items():
value = float(v["qem"]) / 100.0
df.loc[model_revision, f"{task}"] = value
# For AlpacaEval we report both the raw win rate and the length-controlled one
elif task.lower() == "alpaca_eval":
value = data["results"][first_result_key]["win_rate"]
df.loc[model_revision, "Alpaca_eval"] = value / 100.0
value = data["results"][first_result_key]["length_controlled_winrate"]
df.loc[model_revision, "Alpaca_eval_lc"] = value / 100.0
else:
first_metric_key = next(
iter(data["results"][first_result_key])
) # gets the first key in the first result
value = data["results"][first_result_key][first_metric_key] # gets the value of the first metric
df.loc[model_revision, task] = float(value)
# Drop rows where every benchmark entry is NaN (the Date column is ignored)
df = df.dropna(how="all", axis=0, subset=[c for c in df.columns if c != "Date"])
# Trim mini_math column names
df.columns = [c.replace("_level_", "_l") for c in df.columns]
# Trim AIMO column names
df.columns = [c.replace("aimo_", "") for c in df.columns]
df = df.reset_index().rename(columns={"index": "Model"}).round(4)
# Strip off date from model name
df["Model"] = df["Model"].apply(lambda x: x.rsplit("_", 1)[0])
return df
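# Build the raw, unaggregated results table once at startup; the Gradio
# callbacks below filter and aggregate this global DataFrame on every change.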
leaderboard_df = get_leaderboard_df()
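# A model can appear several times (one run per date), so agg_df collapses the
# runs into a single row per model using the chosen aggregation, prepends a
# row-wise "Average" column, and rescales all scores to percentages.
# Illustrative example with hypothetical values and agg="max":
#   Model  Date        mmlu  ifeval          Model  Average  mmlu  ifeval
#   m1     2024-05-01  0.61  0.40     -->    m1     55.0     65.0  45.0
#   m1     2024-06-01  0.65  0.45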
def agg_df(df, agg: str = "max"):
df = df.copy()
# Drop date and aggregate results by model name
df = df.drop("Date", axis=1).groupby("Model").agg(agg).reset_index()
df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
# Convert all values to percentage
df[df.select_dtypes(include=["number"]).columns] *= 100.0
df = df.sort_values(by=["Average"], ascending=False)
return df
# Update the table based on the selected columns, search query, and aggregation mode
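# Search terms are split on ";" and OR-matched as a case-insensitive regex
# against the model names, e.g. a (hypothetical) query "qwen; llama" keeps any
# model whose name contains either substring. When a subset of columns is
# selected, the "Average" column is recomputed over just those columns.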
def filter_and_search(cols: list[str], search_query: str, agg: str):
df = leaderboard_df
df = agg_df(df, agg)
if len(search_query) > 0:
search_terms = search_query.split(";")
search_terms = [term.strip().lower() for term in search_terms]
pattern = "|".join(search_terms)
df = df[df["Model"].str.lower().str.contains(pattern, regex=True)]
# Drop any columns which are all NaN
df = df.dropna(how="all", axis=1)
if len(cols) > 0:
index_cols = list(leaderboard_df.columns[:1])
new_cols = index_cols + cols
df = df.copy()[new_cols]
# Drop rows where all of the selected columns are NaN
df = df.copy().dropna(how="all", axis=0, subset=[c for c in df.columns if c in cols])
# Recompute average
df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
return df
demo = gr.Blocks()
with demo:
gr.HTML(TITLE)
with gr.Column():
gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
with gr.Row():
search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)
agg = gr.Radio(
["min", "max", "mean"],
value="max",
label="Aggregation",
info="How to aggregate results for each model",
)
with gr.Row():
cols_bar = gr.CheckboxGroup(
choices=[c for c in leaderboard_df.columns[1:] if c != "Average"],
show_label=False,
info="Select columns to display",
)
with gr.Group():
leaderboard_table = gr.Dataframe(
value=leaderboard_df,
wrap=True,
height=1000,
column_widths=[400, 110] + [(260 + len(c)) for c in leaderboard_df.columns[1:]],
)
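# Re-render the leaderboard whenever the column selection, the aggregation mode,
# or the search query (on submit) changes.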
cols_bar.change(filter_and_search, inputs=[cols_bar, search_bar, agg], outputs=[leaderboard_table])
agg.change(filter_and_search, inputs=[cols_bar, search_bar, agg], outputs=[leaderboard_table])
search_bar.submit(filter_and_search, inputs=[cols_bar, search_bar, agg], outputs=[leaderboard_table])
demo.launch()