Spaces:
Runtime error
Runtime error
bulk 1
Browse files
app.py
CHANGED
@@ -100,7 +100,7 @@ def main():
|
|
100 |
with tab1:
|
101 |
if content:
|
102 |
try:
|
103 |
-
score_columns = ['
|
104 |
|
105 |
# Display dataframe
|
106 |
full_df = convert_markdown_table_to_dataframe(content)
|
@@ -111,26 +111,9 @@ def main():
|
|
111 |
full_df['Tags'] = full_df['Tags'].fillna('')
|
112 |
df = pd.DataFrame(columns=full_df.columns)
|
113 |
|
114 |
-
# Toggles
|
115 |
-
col1, col2, col3 = st.columns(3)
|
116 |
-
with col1:
|
117 |
-
show_phi = st.checkbox("Phi (2.8B)", value=True)
|
118 |
-
with col2:
|
119 |
-
show_mistral = st.checkbox("Mistral (7B)", value=True)
|
120 |
-
with col3:
|
121 |
-
show_other = st.checkbox("Other", value=True)
|
122 |
-
|
123 |
# Create a DataFrame based on selected filters
|
124 |
dfs_to_concat = []
|
125 |
|
126 |
-
if show_phi:
|
127 |
-
dfs_to_concat.append(full_df[full_df['Tags'].str.lower().str.contains('phi,|phi-msft,')])
|
128 |
-
if show_mistral:
|
129 |
-
dfs_to_concat.append(full_df[full_df['Tags'].str.lower().str.contains('mistral,')])
|
130 |
-
if show_other:
|
131 |
-
other_df = full_df[~full_df['Tags'].str.lower().str.contains('phi,|phi-msft,|mistral,')]
|
132 |
-
dfs_to_concat.append(other_df)
|
133 |
-
|
134 |
# Concatenate the DataFrames
|
135 |
if dfs_to_concat:
|
136 |
df = pd.concat(dfs_to_concat, ignore_index=True)
|
@@ -219,27 +202,15 @@ def main():
|
|
219 |
# About tab
|
220 |
with tab2:
|
221 |
st.markdown('''
|
222 |
-
###
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
### Reproducibility
|
232 |
-
|
233 |
-
You can easily reproduce these results using 🧐 [LLM AutoEval](https://github.com/mlabonne/llm-autoeval/tree/master), a colab notebook that automates the evaluation process (benchmark: `nous`). This will upload the results to GitHub as gists. You can find the entire table with the links to the detailed results [here](https://gist.github.com/mlabonne/90294929a2dbcb8877f9696f28105fdf).
|
234 |
-
|
235 |
-
### Clone this space
|
236 |
-
|
237 |
-
You can create your own leaderboard with your LLM AutoEval results on GitHub Gist. You just need to clone this space and specify two variables:
|
238 |
-
|
239 |
-
* Change the `gist_id` in [yall.py](https://huggingface.co/spaces/mlabonne/Yet_Another_LLM_Leaderboard/blob/main/yall.py#L126).
|
240 |
-
* Create "New Secret" in Settings > Variables and secrets (name: "github", value: [your GitHub token](https://github.com/settings/tokens))
|
241 |
-
|
242 |
-
A special thanks to [gblazex](https://huggingface.co/gblazex) for providing many evaluations and [CultriX](https://huggingface.co/CultriX) for the CSV export and search bar.
|
243 |
''')
|
244 |
|
245 |
if __name__ == "__main__":
|
|
|
100 |
with tab1:
|
101 |
if content:
|
102 |
try:
|
103 |
+
score_columns = ['Elo']
|
104 |
|
105 |
# Display dataframe
|
106 |
full_df = convert_markdown_table_to_dataframe(content)
|
|
|
111 |
full_df['Tags'] = full_df['Tags'].fillna('')
|
112 |
df = pd.DataFrame(columns=full_df.columns)
|
113 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
# Create a DataFrame based on selected filters
|
115 |
dfs_to_concat = []
|
116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
117 |
# Concatenate the DataFrames
|
118 |
if dfs_to_concat:
|
119 |
df = pd.concat(dfs_to_concat, ignore_index=True)
|
|
|
202 |
# About tab
|
203 |
with tab2:
|
204 |
st.markdown('''
|
205 |
+
### Roleplay Leaderboard
|
206 |
+
|
207 |
+
This space is here to present the results from the Matou-Garou space, where humans and AIs play a game of werewolf.
|
208 |
+
|
209 |
+
It is meant as a social experiment to see whether you would be able to detect that you are talking to an AI.
|
210 |
+
We also hope that this leaderboard can be used by video game creators in the future to select which model to use for LLM-based NPCs.
|
211 |
+
|
212 |
+
Popularized by [Teknium](https://huggingface.co/teknium) and [NousResearch](https://huggingface.co/NousResearch), this benchmark suite aggregates four benchmarks
|
213 |
+
Leaderboard copied from [Maxime Labonne](https://huggingface.co/mlabonne)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
214 |
''')
|
215 |
|
216 |
if __name__ == "__main__":
|
yall.py
CHANGED
@@ -12,6 +12,7 @@ class GistInfo:
|
|
12 |
model_name: str
|
13 |
model_id: str
|
14 |
model: str
|
|
|
15 |
agieval: float
|
16 |
gpt4all: float
|
17 |
truthfulqa: float
|
@@ -59,11 +60,7 @@ def create_yall():
|
|
59 |
model_name="Model 1",
|
60 |
model_id="model-1",
|
61 |
model="Model 1",
|
62 |
-
|
63 |
-
gpt4all=88.7,
|
64 |
-
truthfulqa=90.3,
|
65 |
-
bigbench=85.6,
|
66 |
-
average=90.0
|
67 |
),
|
68 |
GistInfo(
|
69 |
gist_id="dummy_gist_id_2",
|
@@ -72,11 +69,7 @@ def create_yall():
|
|
72 |
model_name="Model 2",
|
73 |
model_id="model-2",
|
74 |
model="Model 2",
|
75 |
-
|
76 |
-
gpt4all=85.0,
|
77 |
-
truthfulqa=87.5,
|
78 |
-
bigbench=83.0,
|
79 |
-
average=86.2
|
80 |
),
|
81 |
GistInfo(
|
82 |
gist_id="dummy_gist_id_3",
|
@@ -85,11 +78,7 @@ def create_yall():
|
|
85 |
model_name="Model 3",
|
86 |
model_id="model-3",
|
87 |
model="Model 3",
|
88 |
-
|
89 |
-
gpt4all=81.4,
|
90 |
-
truthfulqa=79.5,
|
91 |
-
bigbench=77.0,
|
92 |
-
average=79.0
|
93 |
)
|
94 |
]
|
95 |
|
@@ -97,12 +86,12 @@ def create_yall():
|
|
97 |
gist_infos = sorted(gist_infos, key=lambda x: x.average, reverse=True)
|
98 |
|
99 |
# Create markdown table
|
100 |
-
markdown_table = "| Model | Average |
|
101 |
-
markdown_table += "
|
102 |
|
103 |
for gist in gist_infos:
|
104 |
model_link = f"[{gist.model_id}](https://huggingface.co/{gist.model_id})"
|
105 |
-
markdown_table += f"| {model_link} [📄]({gist.url}) | {gist.average} | {gist.
|
106 |
|
107 |
# Update YALL's gist with dummy gist ID and token
|
108 |
update_gist(content=markdown_table, gist_id="dummy_gist_id_yall", access_token="dummy_access_token")
|
|
|
12 |
model_name: str
|
13 |
model_id: str
|
14 |
model: str
|
15 |
+
elo:float
|
16 |
agieval: float
|
17 |
gpt4all: float
|
18 |
truthfulqa: float
|
|
|
60 |
model_name="Model 1",
|
61 |
model_id="model-1",
|
62 |
model="Model 1",
|
63 |
+
elo=1900
|
|
|
|
|
|
|
|
|
64 |
),
|
65 |
GistInfo(
|
66 |
gist_id="dummy_gist_id_2",
|
|
|
69 |
model_name="Model 2",
|
70 |
model_id="model-2",
|
71 |
model="Model 2",
|
72 |
+
elo=2000
|
|
|
|
|
|
|
|
|
73 |
),
|
74 |
GistInfo(
|
75 |
gist_id="dummy_gist_id_3",
|
|
|
78 |
model_name="Model 3",
|
79 |
model_id="model-3",
|
80 |
model="Model 3",
|
81 |
+
elo=2200
|
|
|
|
|
|
|
|
|
82 |
)
|
83 |
]
|
84 |
|
|
|
86 |
gist_infos = sorted(gist_infos, key=lambda x: x.average, reverse=True)
|
87 |
|
88 |
# Create markdown table
|
89 |
+
markdown_table = "| Model | Average | Elo |\n"
|
90 |
+
markdown_table += "|---|---:|---:|\n"
|
91 |
|
92 |
for gist in gist_infos:
|
93 |
model_link = f"[{gist.model_id}](https://huggingface.co/{gist.model_id})"
|
94 |
+
markdown_table += f"| {model_link} [📄]({gist.url}) | {gist.average} | {gist.elo}\n"
|
95 |
|
96 |
# Update YALL's gist with dummy gist ID and token
|
97 |
update_gist(content=markdown_table, gist_id="dummy_gist_id_yall", access_token="dummy_access_token")
|