Jofthomas (HF staff) committed
Commit e8631c9 · 1 Parent(s): 33e4196
Files changed (2)
  1. app.py +10 -39
  2. yall.py +7 -18
app.py CHANGED
@@ -100,7 +100,7 @@ def main():
     with tab1:
         if content:
             try:
-                score_columns = ['Average', 'AGIEval', 'GPT4All', 'TruthfulQA', 'Bigbench']
+                score_columns = ['Elo']
 
                 # Display dataframe
                 full_df = convert_markdown_table_to_dataframe(content)
@@ -111,26 +111,9 @@ def main():
                 full_df['Tags'] = full_df['Tags'].fillna('')
                 df = pd.DataFrame(columns=full_df.columns)
 
-                # Toggles
-                col1, col2, col3 = st.columns(3)
-                with col1:
-                    show_phi = st.checkbox("Phi (2.8B)", value=True)
-                with col2:
-                    show_mistral = st.checkbox("Mistral (7B)", value=True)
-                with col3:
-                    show_other = st.checkbox("Other", value=True)
-
                 # Create a DataFrame based on selected filters
                 dfs_to_concat = []
 
-                if show_phi:
-                    dfs_to_concat.append(full_df[full_df['Tags'].str.lower().str.contains('phi,|phi-msft,')])
-                if show_mistral:
-                    dfs_to_concat.append(full_df[full_df['Tags'].str.lower().str.contains('mistral,')])
-                if show_other:
-                    other_df = full_df[~full_df['Tags'].str.lower().str.contains('phi,|phi-msft,|mistral,')]
-                    dfs_to_concat.append(other_df)
-
                 # Concatenate the DataFrames
                 if dfs_to_concat:
                     df = pd.concat(dfs_to_concat, ignore_index=True)
@@ -219,27 +202,15 @@ def main():
     # About tab
     with tab2:
         st.markdown('''
-        ### Nous benchmark suite
-
-        Popularized by [Teknium](https://huggingface.co/teknium) and [NousResearch](https://huggingface.co/NousResearch), this benchmark suite aggregates four benchmarks:
-
-        * [**AGIEval**](https://arxiv.org/abs/2304.06364) (0-shot): `agieval_aqua_rat,agieval_logiqa_en,agieval_lsat_ar,agieval_lsat_lr,agieval_lsat_rc,agieval_sat_en,agieval_sat_en_without_passage,agieval_sat_math`
-        * **GPT4ALL** (0-shot): `hellaswag,openbookqa,winogrande,arc_easy,arc_challenge,boolq,piqa`
-        * [**TruthfulQA**](https://arxiv.org/abs/2109.07958) (0-shot): `truthfulqa_mc`
-        * [**Bigbench**](https://arxiv.org/abs/2206.04615) (0-shot): `bigbench_causal_judgement,bigbench_date_understanding,bigbench_disambiguation_qa,bigbench_geometric_shapes,bigbench_logical_deduction_five_objects,bigbench_logical_deduction_seven_objects,bigbench_logical_deduction_three_objects,bigbench_movie_recommendation,bigbench_navigate,bigbench_reasoning_about_colored_objects,bigbench_ruin_names,bigbench_salient_translation_error_detection,bigbench_snarks,bigbench_sports_understanding,bigbench_temporal_sequences,bigbench_tracking_shuffled_objects_five_objects,bigbench_tracking_shuffled_objects_seven_objects,bigbench_tracking_shuffled_objects_three_objects`
-
-        ### Reproducibility
-
-        You can easily reproduce these results using 🧐 [LLM AutoEval](https://github.com/mlabonne/llm-autoeval/tree/master), a colab notebook that automates the evaluation process (benchmark: `nous`). This will upload the results to GitHub as gists. You can find the entire table with the links to the detailed results [here](https://gist.github.com/mlabonne/90294929a2dbcb8877f9696f28105fdf).
-
-        ### Clone this space
-
-        You can create your own leaderboard with your LLM AutoEval results on GitHub Gist. You just need to clone this space and specify two variables:
-
-        * Change the `gist_id` in [yall.py](https://huggingface.co/spaces/mlabonne/Yet_Another_LLM_Leaderboard/blob/main/yall.py#L126).
-        * Create "New Secret" in Settings > Variables and secrets (name: "github", value: [your GitHub token](https://github.com/settings/tokens))
-
-        A special thanks to [gblazex](https://huggingface.co/gblazex) for providing many evaluations and [CultriX](https://huggingface.co/CultriX) for the CSV export and search bar.
+        ### Roleplay Leaderboard
+
+        This space presents the results from the Matou-Garou space, where humans and AI play a game of werewolf.
+
+        It is meant as a social experiment to see whether you can tell when you are talking to an AI.
+        We also hope that this leaderboard can help video game creators pick a model for LLM-based NPCs.
+
+        Popularized by [Teknium](https://huggingface.co/teknium) and [NousResearch](https://huggingface.co/NousResearch), this benchmark suite aggregates four benchmarks.
+        Leaderboard copied from [Maxime Labonne](https://huggingface.co/mlabonne).
         ''')
 
 if __name__ == "__main__":
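Note that once the toggles are removed, nothing in the visible hunk appends to `dfs_to_concat`, so `df` would stay empty unless code outside the hunk populates it. A minimal sketch of the simplified flow, assuming the intent is to show every row of the source table (the `build_display_df` helper and the unconditional append are hypothetical, not part of the commit):

```python
import pandas as pd

def build_display_df(full_df: pd.DataFrame) -> pd.DataFrame:
    """Sketch of tab1 after the commit: no per-family checkboxes."""
    dfs_to_concat = []

    # Hypothetical fix: with the filters gone, keep the whole table
    # so the concat below has something to combine.
    dfs_to_concat.append(full_df)

    # Concatenate the DataFrames (mirrors the hunk's final step)
    if dfs_to_concat:
        return pd.concat(dfs_to_concat, ignore_index=True)
    return pd.DataFrame(columns=full_df.columns)
```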
 
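The new About text ranks models by an Elo score earned in werewolf games, but the commit does not show how that score is updated. For reference, a standard Elo update looks like the sketch below; the K-factor of 32 and the win/loss scoring are assumptions, not something this commit specifies:

```python
def expected_score(rating_a: float, rating_b: float) -> float:
    """Probability that player A beats player B under the Elo model."""
    return 1.0 / (1.0 + 10 ** ((rating_b - rating_a) / 400))

def update_elo(rating_a: float, rating_b: float, score_a: float, k: float = 32.0) -> float:
    """Return A's new rating; score_a is 1.0 for a win, 0.5 for a draw, 0.0 for a loss."""
    return rating_a + k * (score_a - expected_score(rating_a, rating_b))

# Example: a 2000-rated model wins a round against a 1900-rated opponent.
print(update_elo(2000, 1900, score_a=1.0))  # ~2011.5
```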
yall.py CHANGED
@@ -12,6 +12,7 @@ class GistInfo:
     model_name: str
     model_id: str
     model: str
+    elo: float
     agieval: float
     gpt4all: float
     truthfulqa: float
@@ -59,11 +60,7 @@ def create_yall():
             model_name="Model 1",
             model_id="model-1",
             model="Model 1",
-            agieval=95.4,
-            gpt4all=88.7,
-            truthfulqa=90.3,
-            bigbench=85.6,
-            average=90.0
+            elo=1900
         ),
         GistInfo(
             gist_id="dummy_gist_id_2",
@@ -72,11 +69,7 @@ def create_yall():
             model_name="Model 2",
             model_id="model-2",
             model="Model 2",
-            agieval=89.1,
-            gpt4all=85.0,
-            truthfulqa=87.5,
-            bigbench=83.0,
-            average=86.2
+            elo=2000
         ),
         GistInfo(
             gist_id="dummy_gist_id_3",
@@ -85,11 +78,7 @@ def create_yall():
             model_name="Model 3",
             model_id="model-3",
             model="Model 3",
-            agieval=78.2,
-            gpt4all=81.4,
-            truthfulqa=79.5,
-            bigbench=77.0,
-            average=79.0
+            elo=2200
         )
     ]
 
@@ -97,12 +86,12 @@ def create_yall():
     gist_infos = sorted(gist_infos, key=lambda x: x.average, reverse=True)
 
     # Create markdown table
-    markdown_table = "| Model | Average | AGIEval | GPT4All | TruthfulQA | Bigbench |\n"
-    markdown_table += "|---|---:|---:|---:|---:|---:|\n"
+    markdown_table = "| Model | Average | Elo |\n"
+    markdown_table += "|---|---:|---:|\n"
 
     for gist in gist_infos:
         model_link = f"[{gist.model_id}](https://huggingface.co/{gist.model_id})"
-        markdown_table += f"| {model_link} [📄]({gist.url}) | {gist.average} | {gist.agieval} | {gist.gpt4all} | {gist.truthfulqa} | {gist.bigbench} |\n"
+        markdown_table += f"| {model_link} [📄]({gist.url}) | {gist.average} | {gist.elo} |\n"
 
     # Update YALL's gist with dummy gist ID and token
     update_gist(content=markdown_table, gist_id="dummy_gist_id_yall", access_token="dummy_access_token")
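As committed, the dataclass still declares `agieval`, `gpt4all`, and `truthfulqa` (and, judging by the later hunks, `average`) as required fields, yet the dummy entries now pass only `elo`, and the sort key still reads `x.average`. A minimal sketch of one self-consistent version, sorting by Elo and defaulting the legacy field (the trimmed field list and the default are assumptions, not part of the commit):

```python
from dataclasses import dataclass

@dataclass
class GistInfo:
    # Trimmed to the fields used below; the real class declares more.
    gist_id: str
    url: str
    model_name: str
    model_id: str
    model: str
    elo: float
    average: float = 0.0  # assumed default so Elo-only entries still construct

gist_infos = [
    GistInfo("dummy_gist_id_1", "https://example.com/gist1", "Model 1", "model-1", "Model 1", elo=1900),
    GistInfo("dummy_gist_id_2", "https://example.com/gist2", "Model 2", "model-2", "Model 2", elo=2000),
    GistInfo("dummy_gist_id_3", "https://example.com/gist3", "Model 3", "model-3", "Model 3", elo=2200),
]

# Rank by Elo rather than the legacy average, matching the new table.
gist_infos.sort(key=lambda x: x.elo, reverse=True)

markdown_table = "| Model | Average | Elo |\n|---|---:|---:|\n"
for gist in gist_infos:
    model_link = f"[{gist.model_id}](https://huggingface.co/{gist.model_id})"
    markdown_table += f"| {model_link} [📄]({gist.url}) | {gist.average} | {gist.elo} |\n"

print(markdown_table)
```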
 
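Running the sketch above would print a table along these lines (the values come from the hypothetical dummy entries):

```
| Model | Average | Elo |
|---|---:|---:|
| [model-3](https://huggingface.co/model-3) [📄](https://example.com/gist3) | 0.0 | 2200 |
| [model-2](https://huggingface.co/model-2) [📄](https://example.com/gist2) | 0.0 | 2000 |
| [model-1](https://huggingface.co/model-1) [📄](https://example.com/gist1) | 0.0 | 1900 |
```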