Create app.py
app.py
ADDED
import re
from io import StringIO

import pandas as pd
import plotly.graph_objs as go
import streamlit as st
from huggingface_hub import HfApi, ModelCard
from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
from yall import create_yall
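# create_yall (imported from this space's own yall.py) is expected to return the
# leaderboard markdown table assembled from LLM AutoEval results stored in a
# GitHub gist (see the About tab); empty content is handled in main().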

def calculate_pages(df, items_per_page):
    """Calculate the number of pages needed for pagination."""
    return -(-len(df) // items_per_page)  # equivalent to math.ceil(len(df) / items_per_page)
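    # Quick sanity check of the ceiling trick: 120 rows at 50 per page gives
    # -(-120 // 50) == 3, matching math.ceil(120 / 50).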

@st.cache_data
def cached_model_info(_api, model):
    """Fetch model information from the Hugging Face API and cache the result."""
    # The leading underscore tells Streamlit not to hash the HfApi client when
    # building the cache key; only `model` determines cache hits.
    try:
        return _api.model_info(repo_id=str(model))
    except (RepositoryNotFoundError, RevisionNotFoundError):
        return None

@st.cache_data
def get_model_info(df):
    """Get model information and update the DataFrame with likes and tags."""
    api = HfApi()
    with st.spinner("Fetching model information..."):
        for index, row in df.iterrows():
            model_info = cached_model_info(api, row['Model'].strip())
            if model_info:
                df.loc[index, 'Likes'] = model_info.likes
                df.loc[index, 'Tags'] = ', '.join(model_info.tags)
            else:
                # Sentinel for repos that could not be resolved (renamed, gated,
                # or deleted): sorts below any real like count.
                df.loc[index, 'Likes'] = -1
                df.loc[index, 'Tags'] = ''
    return df

def convert_markdown_table_to_dataframe(md_content):
    """Convert a markdown table to a pandas DataFrame."""
    # Strip the leading and trailing pipes so pandas can split on the inner ones.
    cleaned_content = re.sub(r'\|\s*$', '', re.sub(r'^\|\s*', '', md_content, flags=re.MULTILINE), flags=re.MULTILINE)
    df = pd.read_csv(StringIO(cleaned_content), sep=r'\|', engine='python')
    df = df.drop(0, axis=0)  # drop the markdown separator row (|---|---|...)
    df.columns = df.columns.str.strip()
    # Each Model cell holds two markdown links: [name](model_url) followed by a
    # second link (e.g. to the evaluation gist).
    model_link_pattern = r'\[(.*?)\]\((.*?)\)\s*\[.*?\]\(.*?\)'
    df['URL'] = df['Model'].apply(lambda x: re.search(model_link_pattern, x).group(2) if re.search(model_link_pattern, x) else None)
    df['Model'] = df['Model'].apply(lambda x: re.sub(model_link_pattern, r'\1', x))
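    # Illustration with a hypothetical cell: "[foo-7B](https://hf.co/u/foo-7B) [gist](gist-url)"
    # leaves Model == "foo-7B" and URL == "https://hf.co/u/foo-7B".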
    return df

def create_bar_chart(df, category):
    """Create a horizontal bar chart for the specified category."""
    st.write(f"### {category} Scores")
    sorted_df = df[['Model', category]].sort_values(by=category, ascending=True)
    fig = go.Figure(go.Bar(
        x=sorted_df[category],
        y=sorted_df['Model'],
        orientation='h',
        marker=dict(color=sorted_df[category], colorscale='Viridis'),
        hoverinfo='x+y',
        text=sorted_df[category],
        textposition='auto'
    ))
    fig.update_layout(
        margin=dict(l=20, r=20, t=20, b=20),
        title=f"Leaderboard for {category} Scores",
        # Set the height on the figure itself; st.plotly_chart does not honor a
        # height keyword argument.
        height=max(400, len(df) * 35)
    )
    st.plotly_chart(fig, use_container_width=True)
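    # Design note: the ascending sort plus horizontal bars puts the best model
    # at the top, since Plotly draws the first category at the bottom of the axis.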

def fetch_merge_configs(df):
    """Fetch and save merge configurations for the top models."""
    df_sorted = df.sort_values(by='Average', ascending=False)
    try:
        # Note: append mode means results accumulate across reruns of the app.
        with open('/tmp/configurations.txt', 'a') as file:
            for index, row in df_sorted.head(20).iterrows():
                model_name = row['Model'].rstrip()
                try:
                    card = ModelCard.load(model_name)
                    file.write(f'Model Name: {model_name}\n')
                    file.write(f'Scores: {row["Average"]}\n')
                    file.write(f'AGIEval: {row["AGIEval"]}\n')
                    file.write(f'GPT4All: {row["GPT4All"]}\n')
                    file.write(f'TruthfulQA: {row["TruthfulQA"]}\n')
                    file.write(f'Bigbench: {row["Bigbench"]}\n')
                    file.write(f'Model Card: {card}\n')
                except Exception as e:
                    st.error(f"Error loading model card for {model_name}: {str(e)}")
        with open('/tmp/configurations.txt', 'r') as file:
            content = file.read()
        matches = re.findall(r'```yaml(.*?)```', content, re.DOTALL)
        with open('/tmp/configurations2.txt', 'w') as file:
            for row, match in zip(df_sorted[['Model', 'Average', 'AGIEval', 'GPT4All', 'TruthfulQA', 'Bigbench']].head(20).values, matches):
                file.write(f'Model Name: {row[0]}\n')
                file.write(f'Scores: {row[1]}\n')
                file.write(f'AGIEval: {row[2]}\n')
                file.write(f'GPT4All: {row[3]}\n')
                file.write(f'TruthfulQA: {row[4]}\n')
                file.write(f'Bigbench: {row[5]}\n')
                file.write('```yaml' + match + '```\n')
    except Exception as e:
        st.error(f"Error while fetching merge configs: {str(e)}")
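# The regex extraction above assumes each top model's card embeds its merge
# configuration in a fenced ```yaml ... ``` block, as mergekit-generated model
# cards typically do; cards without such a block simply contribute no match.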

def main():
    """Main function to set up the Streamlit app and display the leaderboard."""
    st.set_page_config(page_title="YALL - Yet Another LLM Leaderboard", layout="wide")
    st.title("🏆 YALL - Yet Another LLM Leaderboard")
    st.markdown("Leaderboard made with 🧐 [LLM AutoEval](https://github.com/mlabonne/llm-autoeval) using the [Nous](https://huggingface.co/NousResearch) benchmark suite.")
    content = create_yall()
    tab1, tab2 = st.tabs(["🏆 Leaderboard", "📝 About"])

    with tab1:
        if content:
            try:
                score_columns = ['Average', 'AGIEval', 'GPT4All', 'TruthfulQA', 'Bigbench']
                full_df = convert_markdown_table_to_dataframe(content)

                for col in score_columns:
                    full_df[col] = pd.to_numeric(full_df[col].str.strip(), errors='coerce')

                full_df = get_model_info(full_df)
                full_df['Tags'] = full_df['Tags'].fillna('')
                df = pd.DataFrame(columns=full_df.columns)

                show_phi = st.checkbox("Phi (2.8B)", value=True)
                show_mistral = st.checkbox("Mistral (7B)", value=True)
                show_other = st.checkbox("Other", value=True)

                dfs_to_concat = []
                if show_phi:
                    dfs_to_concat.append(full_df[full_df['Tags'].str.lower().str.contains('phi,|phi-msft,')])
                if show_mistral:
                    dfs_to_concat.append(full_df[full_df['Tags'].str.lower().str.contains('mistral,')])
                if show_other:
                    other_df = full_df[~full_df['Tags'].str.lower().str.contains('phi,|phi-msft,|mistral,')]
                    dfs_to_concat.append(other_df)
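                # The trailing commas anchor each match to a complete tag in the
                # comma-joined Tags string (so 'phi,' cannot hit a longer tag);
                # caveat: a tag in last position has no trailing comma and is missed.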

                if dfs_to_concat:
                    df = pd.concat(dfs_to_concat, ignore_index=True)

                search_query = st.text_input("Search models", "")
                if search_query:
                    df = df[df['Model'].str.contains(search_query, case=False)]

                items_per_page = 50
                pages = calculate_pages(df, items_per_page)
                page = st.selectbox("Page", list(range(1, pages + 1)))

                df = df.sort_values(by='Average', ascending=False)
                start = (page - 1) * items_per_page
                end = start + items_per_page
                df = df[start:end]
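                # e.g. page 2 with 50 items per page keeps rows 50-99 of the sorted table.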

                selected_benchmarks = st.multiselect('Select benchmarks to include in the average', score_columns, default=score_columns)

                if selected_benchmarks:
                    # Recompute the average over only the selected benchmark columns.
                    df['Filtered Average'] = df[selected_benchmarks].mean(axis=1)
                    df = df.sort_values(by='Filtered Average', ascending=False)
                    st.dataframe(
                        df[['Model'] + selected_benchmarks + ['Filtered Average', 'Likes', 'URL']],
                        use_container_width=True,
                        column_config={
                            "Likes": st.column_config.NumberColumn(
                                "Likes",
                                help="Number of likes on Hugging Face",
                                format="%d ❤️",
                            ),
                            "URL": st.column_config.LinkColumn("URL"),
                        },
                        hide_index=True,
                        height=len(df) * 37,
                    )

                selected_models = st.multiselect('Select models to compare', df['Model'].unique())
                comparison_df = df[df['Model'].isin(selected_models)]
                st.dataframe(comparison_df)

                if st.button("Export to CSV"):
                    csv_data = df.to_csv(index=False)
                    st.download_button(
                        label="Download CSV",
                        data=csv_data,
                        file_name="leaderboard.csv",
                        key="download-csv",
                        help="Click to download the CSV file",
                    )
                if st.button("Fetch Merge-Configs"):
                    fetch_merge_configs(full_df)
                    st.success("Merge configurations have been fetched and saved.")

                if 'Filtered Average' in df.columns:  # only exists when benchmarks are selected
                    create_bar_chart(df, 'Filtered Average')

                col1, col2 = st.columns(2)
                with col1:
                    create_bar_chart(df, score_columns[1])
                with col2:
                    create_bar_chart(df, score_columns[2])

                col3, col4 = st.columns(2)
                with col3:
                    create_bar_chart(df, score_columns[3])
                with col4:
                    create_bar_chart(df, score_columns[4])

            except Exception as e:
                st.error("An error occurred while processing the markdown table.")
                st.error(str(e))
        else:
            st.error("Failed to download the content from the URL provided.")

    with tab2:
        st.markdown('''
### Nous benchmark suite
Popularized by [Teknium](https://huggingface.co/teknium) and [NousResearch](https://huggingface.co/NousResearch), this benchmark suite aggregates four benchmarks:
* [**AGIEval**](https://arxiv.org/abs/2304.06364) (0-shot): `agieval_aqua_rat,agieval_logiqa_en,agieval_lsat_ar,agieval_lsat_lr,agieval_lsat_rc,agieval_sat_en,agieval_sat_en_without_passage,agieval_sat_math`
* **GPT4All** (0-shot): `hellaswag,openbookqa,winogrande,arc_easy,arc_challenge,boolq,piqa`
* [**TruthfulQA**](https://arxiv.org/abs/2109.07958) (0-shot): `truthfulqa_mc`
* [**Bigbench**](https://arxiv.org/abs/2206.04615) (0-shot): `bigbench_causal_judgement,bigbench_date_understanding,bigbench_disambiguation_qa,bigbench_geometric_shapes,bigbench_logical_deduction_five_objects,bigbench_logical_deduction_seven_objects,bigbench_logical_deduction_three_objects,bigbench_movie_recommendation,bigbench_navigate,bigbench_reasoning_about_colored_objects,bigbench_ruin_names,bigbench_salient_translation_error_detection,bigbench_snarks,bigbench_sports_understanding,bigbench_temporal_sequences,bigbench_tracking_shuffled_objects_five_objects,bigbench_tracking_shuffled_objects_seven_objects,bigbench_tracking_shuffled_objects_three_objects`
### Reproducibility
You can easily reproduce these results using 🧐 [LLM AutoEval](https://github.com/mlabonne/llm-autoeval/tree/master), a Colab notebook that automates the evaluation process (benchmark: `nous`). It uploads the results to GitHub as gists. You can find the entire table with links to the detailed results [here](https://gist.github.com/mlabonne/90294929a2dbcb8877f9696f28105fdf).
### Clone this space
You can create your own leaderboard with your LLM AutoEval results on GitHub Gist. You just need to clone this space and specify two variables:
* Change the `gist_id` in [yall.py](https://huggingface.co/spaces/mlabonne/Yet_Another_LLM_Leaderboard/blob/main/yall.py#L126).
* Create a "New Secret" in Settings > Variables and secrets (name: "github", value: [your GitHub token](https://github.com/settings/tokens)).
A special thanks to [gblazex](https://huggingface.co/gblazex) for providing many evaluations.
''')

if __name__ == "__main__":
    main()