import json
import re
from pathlib import Path

import gradio as gr
import pandas as pd
# Heading text passed to `gr.HTML` when the page is built.
TITLE = """
LLM Leaderboard for H4 Models
"""

# Markdown blurb passed to `gr.Markdown`. It is a plain string: nothing is
# interpolated into it, so it does not need an f-string prefix.
DESCRIPTION = """
Evaluation of H4 and community models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.
"""
# Benchmarks whose result files are ignored when building the leaderboard.
BENCHMARKS_TO_SKIP = ["math", "mini_math"]

# Tasks that report one score per prompt version; each version gets its own column.
_PER_VERSION_TASKS = ("aimo_kaggle_medium_pot", "aimo_kaggle_hard_pot")

# Tasks whose `qem` value is divided by 100 before being stored.
_PERCENT_QEM_TASKS = (
    "aimo_tora_eval_kaggle_medium",
    "aimo_tora_eval_kaggle_hard",
    "aimo_kaggle_fast_eval_hard",
    "aimo_kaggle_tora_medium",
    "aimo_kaggle_tora_hard",
    "aimo_kaggle_tora_medium_extended",
    "aimo_kaggle_tora_hard_extended",
)


def _headline_metric(task_key: str, results: dict):
    """Return the single score reported for a one-column benchmark.

    Args:
        task_key: Lower-cased task name.
        results: The ``"results"`` mapping of one evaluation file.
    """
    # MMLU has several metrics but we report just the average one
    if task_key == "mmlu":
        return [metrics["acc"] for key, metrics in results.items() if "_average" in key.lower()][0]
    # BBH has several metrics but we report just the average one.
    # -100 is kept as the placeholder used when the aggregate entry is missing.
    if task_key == "bbh":
        return results["all"]["acc"] if "all" in results else -100
    # AGIEval reports acc_norm
    if task_key == "agieval":
        return results["all"]["acc_norm"]
    # MATH-style tasks report qem
    if task_key in ("math", "math_v2", "aimo_kaggle"):
        return results["all"]["qem"]

    # Everything else reads from the first entry of `results`.
    first_result = next(iter(results.values()))
    # TruthfulQA has two metrics, so we pick the `mc2` one that's reported on the leaderboard
    if task_key == "truthfulqa":
        return first_result["truthfulqa_mc2"]
    # IFEval has several metrics but we report just the prompt-loose-acc one
    if task_key == "ifeval":
        return first_result["prompt_level_loose_acc"]
    # HellaSwag and ARC report acc_norm
    if task_key in ("hellaswag", "arc"):
        return first_result["acc_norm"]
    # Fallback: the first metric of the first result
    return next(iter(first_result.values()))


def _collect_scores(task: str, results: dict) -> dict:
    """Map one evaluation file to the leaderboard columns it contributes.

    Args:
        task: Task name as it appears in the file path (case preserved, since
            it is used in column names).
        results: The ``"results"`` mapping of the evaluation file.

    Returns:
        A ``{column name: score}`` dict; it may hold several columns.
    """
    task_key = task.lower()

    # For mini_math we report one metric per level, each in its own column
    if task_key == "mini_math_v2":
        scores = {}
        for key, metrics in results.items():
            if key != "all":
                level = key.split("|")[1].split(":")[-1]
                scores[f"{task}_{level}"] = metrics["qem"]
        return scores

    # For kaggle_pot we report N metrics, one per prompt, each in its own column
    if task_key in _PER_VERSION_TASKS:
        scores = {}
        for key, metrics in results.items():
            if key != "all" and "_average" not in key:
                version = key.split("|")[1].split(":")[-1]
                scores[f"{task}_{version}"] = metrics["qem"] if "qem" in metrics else metrics["score"]
        return scores

    # For kaggle_tora we report accuracy, so need to divide by 100.
    # Every entry writes the same column, so the last entry wins.
    if task_key in _PERCENT_QEM_TASKS:
        scores = {}
        for metrics in results.values():
            scores[task] = float(metrics["qem"]) / 100.0
        return scores

    # For AlpacaEval we report the base win rate and the length-corrected one
    if task_key == "alpaca_eval":
        first_result = next(iter(results.values()))
        return {
            "Alpaca_eval": first_result["win_rate"] / 100.0,
            "Alpaca_eval_lc": first_result["length_controlled_winrate"] / 100.0,
        }

    return {task: float(_headline_metric(task_key, results))}


def get_leaderboard_df(agg: str = "max"):
    """Build the leaderboard table from the JSON files under ``eval_results``.

    Files are read from ``eval_results/<a>/<b>/<c>/<task>/<name>.json``. The
    three path components after ``eval_results`` identify the model, and the
    text after the last underscore of the file stem (minus its final three
    characters) is the run date. Each (model, date) pair is one run; runs are
    scored, scaled to percentages, and then aggregated per model.

    Args:
        agg: Aggregation applied across the runs of a model; passed straight
            to ``DataFrameGroupBy.agg`` (the UI offers "min", "max", "mean").

    Returns:
        A DataFrame with a ``Model`` column, an ``Average`` column (row mean
        of each run's scores, aggregated like every other column) and one
        column per reported metric, sorted by ``Average`` in descending
        order. When there is nothing to report, an empty frame with just the
        ``Model`` and ``Average`` columns is returned.
    """
    rows = {}  # run name -> {column: score}
    column_order = {}  # columns in first-seen order (dict used as an ordered set)

    # Sorted so column order does not depend on directory listing order.
    for filepath in sorted(Path("eval_results").rglob("*.json")):
        path_parts = filepath.parts
        task = path_parts[4]
        # Skip benchmarks that we don't want to include in the leaderboard
        if task.lower() in BENCHMARKS_TO_SKIP:
            continue

        date = filepath.stem.split("_")[-1][:-3]
        model_revision = "_".join(path_parts[1:4]) + "_" + date

        with open(filepath, "r", encoding="utf-8") as file:
            data = json.load(file)

        scores = _collect_scores(task, data["results"])
        if scores:
            rows.setdefault(model_revision, {}).update(scores)
            column_order.update(dict.fromkeys(scores))

    if not rows:
        return pd.DataFrame(columns=["Model", "Average"])

    # Build the frame in one go instead of growing it cell by cell.
    df = pd.DataFrame.from_dict(rows, orient="index").reindex(columns=list(column_order))
    # Drop rows where every entry is NaN
    df = df.dropna(how="all", axis=0)
    if df.empty:
        return pd.DataFrame(columns=["Model", "Average"])

    # Trim minimath and AIMO column names
    df.columns = [c.replace("_level_", "_l").replace("aimo_", "") for c in df.columns]
    df.insert(loc=0, column="Average", value=df.mean(axis=1, numeric_only=True))
    # Convert all values to percentage
    df[df.select_dtypes(include=["number"]).columns] *= 100.0
    # Best run first within each model, so order-dependent aggregations see it first.
    df = df.sort_values(by=["Average"], ascending=False)
    df = df.rename_axis("Model").reset_index().round(2)
    # Strip off date from model name
    df["Model"] = df["Model"].str.rsplit("_", n=1).str[0]
    # Aggregate results by model name
    df = df.groupby("Model").agg(agg).reset_index()
    # groupby orders the rows by model name, so rank again by Average.
    return df.sort_values(by="Average", ascending=False, kind="mergesort", ignore_index=True)
def refresh(agg: str = "max"):
    """Rebuild the leaderboard table with the given aggregation.

    Args:
        agg: Aggregation name forwarded to `get_leaderboard_df`.

    Returns:
        Whatever `get_leaderboard_df` returns for that aggregation.
    """
    reloaded = get_leaderboard_df(agg=agg)
    return reloaded
# Function to update the table based on search query
def update_table(search_query, agg):
    """Filter the leaderboard to models whose name contains a search term.

    Args:
        search_query: Text typed by the user. Several terms may be separated
            with ";" and a model is kept when it contains any of them.
            Matching is case-insensitive and literal.
        agg: Aggregation name forwarded to `get_leaderboard_df`.

    Returns:
        The full leaderboard when the query is empty; otherwise the matching
        rows with all-NaN columns removed.
    """
    df = get_leaderboard_df(agg)
    if not search_query:
        return df

    search_terms = [term.strip().lower() for term in search_query.split(";")]
    # Escape each term so characters such as "(", "+" or "." are matched
    # literally instead of raising or over-matching as regex syntax. Blank
    # terms (e.g. from a trailing ";") are dropped: an empty alternative
    # would match every model and disable the filter.
    pattern = "|".join(re.escape(term) for term in search_terms if term)
    if pattern:
        df = df[df["Model"].str.lower().str.contains(pattern, regex=True, na=False)]
    # Drop any columns which are all NaN
    return df.dropna(how="all", axis=1)
def filter_columns(cols):
    """Restrict the leaderboard to the selected benchmark columns.

    Args:
        cols: Column names selected by the user.

    Returns:
        With a non-empty selection: the first (model name) column, an
        ``Average`` recomputed over the selected columns only, and the
        selected columns, keeping only rows that have a value in at least one
        of them. With an empty selection: the unfiltered leaderboard.
    """
    df = get_leaderboard_df()
    # Nothing selected means "no filter". Without this guard the row filter
    # below runs with an empty subset, which drops every row.
    if not cols:
        return df

    # Take the name column from the fresh frame rather than a module global.
    index_cols = list(df.columns[:1])
    df = df[index_cols + list(cols)]
    # Drop rows with no value in any selected column
    df = df.dropna(how="all", axis=0, subset=[c for c in df.columns if c in cols])
    # Recompute average
    df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
    return df
# Load the leaderboard once at start-up; it seeds both the column picker and
# the initial table contents.
leaderboard_df = get_leaderboard_df()

demo = gr.Blocks()

with demo:
    gr.HTML(TITLE)
    with gr.Column():
        gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
        with gr.Row():
            search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)
            agg = gr.Radio(
                ["min", "max", "mean"],
                value="max",
                label="Aggregation",
                info="How to aggregate results for each model",
            )
        with gr.Row():
            cols_bar = gr.CheckboxGroup(
                choices=[c for c in leaderboard_df.columns[1:] if c != "Average"],
                show_label=False,
                info="Select columns to display",
            )
        with gr.Group():
            leaderboard_table = gr.Dataframe(
                value=leaderboard_df,
                wrap=True,
                height=1000,
                # One width per column: Model, Average, then each benchmark.
                # The benchmark widths start at the third column so they line
                # up with the columns they are computed from.
                column_widths=[400, 110] + [(220 + len(c)) for c in leaderboard_df.columns[2:]],
            )
        with gr.Row():
            refresh_button = gr.Button("Refresh")

    cols_bar.change(filter_columns, inputs=[cols_bar], outputs=[leaderboard_table])
    agg.change(refresh, inputs=[agg], outputs=[leaderboard_table])
    search_bar.submit(update_table, inputs=[search_bar, agg], outputs=[leaderboard_table])
    # Pass the selected aggregation so a refresh does not fall back to "max".
    refresh_button.click(refresh, inputs=[agg], outputs=[leaderboard_table])

demo.launch()