|
"""A gradio app that renders a static leaderboard. This is used for Hugging Face Space.""" |
|
|
|
import ast |
|
import argparse |
|
import glob |
|
import pickle |
|
|
|
import gradio as gr |
|
import numpy as np |
|
import pandas as pd |
|
import plotly.graph_objects as go |
|
import pandas as pd |
|
|
|
|
|
|
|
# Price lookup keyed by "<vendor>/<model>" identifier.
# get_full_table() renders each value as "<price>$" in the column it labels
# "Cost (1M-Tokens)"; every model listed in the leaderboard CSV is looked up
# here by its `model_name`.
MODEL_NAME_COST = {
    # Anthropic
    "anthropic/claude-2.1": 8,
    "anthropic/claude-3-haiku": 0.25,
    "anthropic/claude-3-opus": 15,
    "anthropic/claude-3-sonnet": 3,
    # Cohere
    "cohere/command-r": 0.5,
    # Google
    "google/gemini-pro": 0.12,
    "google/gemma-7b-it": 0.1,
    # Mistral AI
    "mistralai/mistral-large": 8,
    "mistralai/mistral-medium": 2.7,
    "mistralai/mixtral-8x7b-instruct": 0.7,
    # OpenAI
    "openai/gpt-3.5-turbo": 0.5,
    "openai/gpt-4-1106-preview": 10,
}
|
|
|
|
|
def make_default_md():
    """Return the Markdown intro shown at the top of the leaderboard page.

    The text introduces CZ-EVAL, links to the two evaluation datasets and
    points to the evaluation code. The function takes no input and has no
    side effects.

    Returns:
        str: Markdown source for the intro section.
    """
    # Plain (non-f) string: the text contains no placeholders.
    # Dataset links point at the canonical host "huggingface.co" (the
    # previous "huggingface.co." form carried a stray trailing dot).
    # NOTE(review): the glyphs in front of the heading and of "Code" look
    # like emoji decoded with the wrong charset -- confirm the file encoding.
    return """
# ๐ CZ-EVAL Leaderboard
[Developer](https://me.hynky.name/) | [Twitter](https://twitter.com/HKydlicek)

CZ-EVAL is an evaluation leaderboard of Tasks in Czech for LLMs.

It's evaluated on following datasets:

- Math Problems Understanding [Klokan-QA](https://huggingface.co/datasets/hynky/klokan-qa)
- Reasoning and General Knowledge [TSP-QA](https://huggingface.co/datasets/hynky/tsp-qa)

๐ป Code: The evaluation code can be found at [hynky1999/LLM-Eval](https://github.com/hynky1999/LLM-Eval). Model inference is done using [Open-Router](https://openrouter.ai/) or on cloud using [Modal Labs](https://modal.com/).
"""
|
|
|
|
|
def make_arena_leaderboard_md(arena_df):
    """Return the one-line Markdown summary shown above the leaderboard table.

    Args:
        arena_df: Any sized object (the caller passes the leaderboard
            DataFrame); only ``len(arena_df)`` is read.

    Returns:
        str: Markdown stating the number of models and a fixed
        "last updated" date.
    """
    # The date is a hard-coded literal and must be edited by hand when the
    # leaderboard data is refreshed.
    return f"""
Total #models: **{len(arena_df)}**. Last updated: Mar 17, 2024.
"""
|
|
|
|
|
def make_full_leaderboard_md(elo_results):
    """Return the Markdown blurb describing the benchmark datasets.

    Args:
        elo_results: Not read by the function body; kept so the signature
            stays unchanged for callers.

    Returns:
        str: Markdown with links to the Klokan-QA and TSP datasets.
    """
    # Plain (non-f) string: the text contains no placeholders.
    # Dataset links point at the canonical host "huggingface.co" (the
    # previous "huggingface.co." form carried a stray trailing dot).
    # NOTE(review): the first sentence still names Arena Elo / MT-Bench / MMLU
    # and the TSP description stops mid-sentence -- the copy looks unfinished;
    # confirm the intended wording before surfacing this text in the UI.
    return """
Three benchmarks are displayed: **Arena Elo**, **MT-Bench** and **MMLU**.
- [Klokan-QA](https://huggingface.co/datasets/hynky/klokan-qa) - Mathematical competitions dataset
- [TSP](https://huggingface.co/datasets/hynky/TSP) - Comprehensive dataset of

"""
|
|
|
|
|
|
|
|
|
|
|
# Line colours for the radar traces, assigned in row order. The palette is
# reused cyclically when a table has more rows than colours.
_SPIDER_COLORS = (
    "#1f77b4",
    "#ff7f0e",
    "#2ca02c",
    "#d62728",
    "#9467bd",
    "#8c564b",
    "#e377c2",
    "#7f7f7f",
    "#bcbd22",
    "#17becf",
    "#f7b6d2",
    "#bcbd22",
    "#dbdb8d",
    "#17becf",
    "#9edae5",
    "#c5b0d5",
    "#c49c94",
    "#f7b6d2",
    "#bcbd22",
    "#dbdb8d",
    "#17becf",
    "#9edae5",
    "#c5b0d5",
    "#c49c94",
)


def plot_spider(df, title):
    """Build a radar ("spider") chart with one trace per row of ``df``.

    Args:
        df: pandas DataFrame. The first column supplies each trace's legend
            name; every remaining column is one angular axis. The radial
            axis is fixed to 0-100 with percent tick labels, so values are
            presumably percentages -- confirm against the CSV producers.
        title: Text shown as the figure title.

    Returns:
        plotly.graph_objects.Figure: the assembled figure. A frame with no
        rows (or no value columns) yields a figure without visible traces
        instead of raising.
    """
    # Repeat the first axis at the end so every polygon closes on itself.
    # Slicing (`[:1]`) instead of indexing keeps this safe for empty input.
    axes = df.columns.tolist()[1:]
    closed_axes = axes + axes[:1]

    fig = go.Figure()
    for position, (_, record) in enumerate(df.iterrows()):
        # `.iloc` makes the positional lookup explicit; plain `record[0]` on
        # a label-indexed Series is deprecated in recent pandas releases.
        trace_name = record.iloc[0]
        values = record.tolist()[1:]
        # Wrap around the palette so tables with more rows than colours do
        # not raise IndexError.
        color = _SPIDER_COLORS[position % len(_SPIDER_COLORS)]
        fig.add_trace(
            go.Scatterpolar(
                r=values + values[:1],
                theta=closed_axes,
                opacity=0.4,
                name=trace_name,
                line=dict(color=color, width=4),
            )
        )

    fig.update_layout(
        width=600,
        height=950,
        polar=dict(
            angularaxis=dict(
                gridwidth=2,
                rotation=90,
                direction="clockwise",
            ),
            radialaxis=dict(
                visible=True,
                range=[0, 100],
                angle=45,
                tickangle=45,
                tickvals=[0, 25, 50, 75, 100],
                ticktext=["0%", "25%", "50%", "75%", "100%"],
            ),
        ),
        title_text=title,
        title_x=0.5,
        title_y=0.97,
        title_xanchor="center",
        title_yanchor="top",
        title_font_size=24,
        title_font_color="#333333",
        font=dict(family="Arial", size=16, color="#333333"),
        # Horizontal legend placed below the plot area.
        legend=dict(
            orientation="h", yanchor="bottom", y=-0.45, xanchor="center", x=0.5
        ),
    )
    return fig
|
|
|
|
|
def openrouter_hyperlink(model_name):
    """Return an HTML anchor linking ``model_name`` to its OpenRouter page.

    Args:
        model_name: Model identifier; it is interpolated verbatim (no
            escaping) into both the URL path and the link text.

    Returns:
        str: an ``<a>`` element that opens in a new tab and is styled with a
        dotted underline.
    """
    target_url = f"https://openrouter.ai/models/{model_name}"
    link_style = (
        "color: var(--link-text-color); "
        "text-decoration: underline;"
        "text-decoration-style: dotted;"
    )
    return f'<a target="_blank" href="{target_url}" style="{link_style}">{model_name}</a>'
|
|
|
|
|
def get_full_table(model_table_df):
    """Turn the raw results table into the display-ready leaderboard table.

    Steps, in order: average the five score columns per model, convert all
    scores to percentages rounded to two decimals, sort by the average
    (best first), number the rows as ``rank``, add a price label, replace the
    model name with an HTML link, then select and rename the columns for
    display.

    Args:
        model_table_df: pandas DataFrame with a ``model_name`` column and the
            score columns ``klokan``, ``culture``, ``analytical``,
            ``critical`` and ``verbal``. Scores are multiplied by 100, so
            they are presumably fractions in [0, 1] -- confirm against the
            CSV. The frame is not modified.

    Returns:
        pandas.DataFrame: a new frame with nine columns (rank, model, cost,
        five scores, average) under their display headers.

    Raises:
        KeyError: if a required column is missing from ``model_table_df``.
    """
    score_cols = ["klokan", "culture", "analytical", "critical", "verbal"]
    pct_cols = score_cols + ["average"]

    # Work on a copy: the caller's frame must stay untouched, otherwise a
    # second call on the same frame would fail when inserting "rank" again.
    table = model_table_df.copy()

    # Average first (on the raw values), then scale everything to percent.
    table["average"] = table[score_cols].mean(axis=1)
    table[pct_cols] = (table[pct_cols] * 100).round(2)

    table = table.sort_values(by="average", ascending=False)
    # Positional ranks: row 1 is the best average after sorting.
    table.insert(0, "rank", np.arange(1, len(table) + 1))

    def _price_label(model_name):
        # Models without a known price show "N/A" rather than aborting the
        # whole page with a KeyError.
        price = MODEL_NAME_COST.get(model_name)
        return "N/A" if price is None else f"{price}$"

    # The price lookup needs the raw identifier, so it must run before the
    # name is replaced by its HTML link.
    table["completion_price"] = table["model_name"].apply(_price_label)
    table["model_name"] = table["model_name"].apply(openrouter_hyperlink)

    display_order = ["rank", "model_name", "completion_price"] + pct_cols
    # NOTE(review): the glyphs prefixed to the headers below look like emoji
    # decoded with the wrong charset -- confirm the file encoding.
    # rename() returns a new frame; renaming in place on a column selection
    # can trigger pandas' chained-assignment warning.
    return table[display_order].rename(
        columns={
            "model_name": "๐ค Model",
            "completion_price": "๐ฐ Cost (1M-Tokens)",
            "klokan": "๐งฎ Klokan-QA",
            "culture": "๐ TSP-Culture",
            "analytical": "๐ TSP-Analytical",
            "critical": "๐ก TSP-Critical",
            "verbal": "๐ TSP-Verbal",
            "average": "๐ Average",
        }
    )
|
|
|
|
|
def build_leaderboard_tab(leaderboard_table_file, klokan_table_file, tsp_table_file):
    """Create the leaderboard UI: intro text, results table and two charts.

    Gradio components are instantiated directly, so this is meant to be
    called inside an open ``gr.Blocks`` context (as ``build_demo`` does).

    Args:
        leaderboard_table_file: Path of the CSV with the overall results;
            it is passed through ``get_full_table`` before display.
        klokan_table_file: Path of the CSV plotted as the Klokan-QA chart.
        tsp_table_file: Path of the CSV plotted as the TSP chart.

    Returns:
        list: ``[intro_markdown, klokan_plot, tsp_plot]`` components.
    """
    # NOTE(review): block nesting was reconstructed -- the table lives inside
    # the single tab, the statistics section follows the Tabs container.
    results = get_full_table(pd.read_csv(leaderboard_table_file))

    md_1 = gr.Markdown(make_default_md(), elem_id="leaderboard_markdown")
    with gr.Tabs():
        with gr.Tab("CZ-EVAL Leaderboard", id=0):
            gr.Markdown(
                make_arena_leaderboard_md(results), elem_id="leaderboard_markdown"
            )
            gr.Dataframe(
                # One entry per displayed column, in get_full_table() order.
                datatype=[
                    "number",  # rank
                    "markdown",  # model name, rendered as an HTML link
                    "str",  # cost label such as "8$"
                    "number",  # Klokan-QA
                    "number",  # TSP-Culture
                    "number",  # TSP-Analytical
                    "number",  # TSP-Critical
                    "number",  # TSP-Verbal
                    "number",  # average
                ],
                value=results,
                elem_id="arena_leaderboard_dataframe",
                height=700,
                column_widths=[70, 200, 110, 120, 120, 120, 120, 100, 100],
                wrap=True,
            )

    p1 = plot_spider(pd.read_csv(klokan_table_file), "Klokan-QA - Accuracy")
    p2 = plot_spider(pd.read_csv(tsp_table_file), "TSP - Accuracy")

    gr.Markdown(
        "## More Statistics for CZ-EVAL\n\nBelow are figures for more statistics.\n",
        elem_id="leaderboard_markdown",
    )
    with gr.Row():
        with gr.Column():
            gr.Markdown(
                "#### Figure 1: Performance of models on Klokan-QA per difficulty"
            )
            plot_1 = gr.Plot(p1, show_label=False)
        with gr.Column():
            gr.Markdown("#### Figure 2: Performance of models on TSP dataset")
            plot_2 = gr.Plot(p2, show_label=False)

    return [md_1, plot_1, plot_2]
|
|
|
|
|
# Custom stylesheet handed to gr.Blocks(css=...) in build_demo().
# Selectors starting with "#" target components by their `elem_id`.
# NOTE(review): no component in this file uses the elem_id
# "leaderboard_dataframe" (the table is "arena_leaderboard_dataframe"), nor
# "notice_markdown" or the "image-container" class -- confirm whether those
# rules are still needed.
block_css = """
#notice_markdown {
font-size: 104%
}
#notice_markdown th {
display: none;
}
#notice_markdown td {
padding-top: 6px;
padding-bottom: 6px;
}
#leaderboard_markdown {
font-size: 104%
}
#leaderboard_markdown td {
padding-top: 6px;
padding-bottom: 6px;
}
#leaderboard_dataframe td {
line-height: 0.1em;
}
footer {
display:none !important
}
.image-container {
display: flex;
align-items: center;
padding: 1px;
}
.image-container img {
margin: 0 30px;
height: 20px;
max-height: 100%;
width: auto;
max-width: 20%;
}
"""
|
|
|
|
|
def build_demo(leadboard_table, klokan_table, tsp_table):
    """Assemble the complete Gradio app for the leaderboard.

    Args:
        leadboard_table: Path of the CSV with the overall results.
        klokan_table: Path of the CSV behind the Klokan-QA chart.
        tsp_table: Path of the CSV behind the TSP chart.

    Returns:
        gr.Blocks: the (not yet launched) app, using the Base theme with
        large text and the module-level ``block_css`` stylesheet.
    """
    with gr.Blocks(
        title="CZ-EVAL Leaderboard",
        theme=gr.themes.Base(text_size=gr.themes.sizes.text_lg),
        css=block_css,
    ) as demo:
        # Components register themselves with the open Blocks context; the
        # returned component list is not needed here.
        build_leaderboard_tab(leadboard_table, klokan_table, tsp_table)
    return demo
|
|
|
|
|
# The app is built at import time, so importing this module already reads the
# three CSV files. Paths are relative to the current working directory.
# NOTE(review): presumably kept at module level so a hosting runtime can pick
# up `demo` by name -- confirm before moving it under the __main__ guard.
demo = build_demo(
    leadboard_table="./leaderboard/table.csv",
    klokan_table="./leaderboard/klokan.csv",
    tsp_table="./leaderboard/tsp.csv",
)


# Start the local server only when executed as a script.
if __name__ == "__main__":
    demo.launch()
|
|