Commit 8a31745 by lewtun (HF staff)
1 Parent(s): f63499d
Files changed (2):
  1. app.py +23 -24
  2. debug.ipynb +127 -357
app.py CHANGED
@@ -13,7 +13,7 @@ Evaluation of H4 and community models across a diverse range of benchmarks from
 BENCHMARKS_TO_SKIP = ["math", "mini_math"]
 
 
-def get_leaderboard_df(merge_values: bool = True):
+def get_leaderboard_df(agg: str = "max"):
     filepaths = list(Path("eval_results").rglob("*.json"))
 
     # Parse filepaths to get unique models
@@ -29,9 +29,9 @@ def get_leaderboard_df(merge_values: bool = True):
     # Extract data from each file and populate the DataFrame
     for filepath in filepaths:
         path_parts = Path(filepath).parts
-        date = filepath.stem.split("_")[-1][:-3].split("T")[0]
+        date = filepath.stem.split("_")[-1][:-3]
         model_revision = "_".join(path_parts[1:4]) + "_" + date
-        task = path_parts[4]  # .capitalize()
+        task = path_parts[4]
         df.loc[model_revision, "Date"] = date
 
         with open(filepath, "r") as file:
@@ -115,13 +115,14 @@ def get_leaderboard_df(merge_values: bool = True):
 
     # Drop rows where every entry is NaN
     df = df.dropna(how="all", axis=0, subset=[c for c in df.columns if c != "Date"])
+
     # Trim minimath column names
     df.columns = [c.replace("_level_", "_l") for c in df.columns]
 
     # Trim AIMO column names
     df.columns = [c.replace("aimo_", "") for c in df.columns]
 
-    df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
+    df.insert(loc=0, column="Average", value=df.mean(axis=1, numeric_only=True))
 
     # Convert all values to percentage
     df[df.select_dtypes(include=["number"]).columns] *= 100.0
@@ -130,22 +131,19 @@ def get_leaderboard_df(merge_values: bool = True):
     # Strip off date from model name
     df["Model"] = df["Model"].apply(lambda x: x.rsplit("_", 1)[0])
 
-    if merge_values:
-        merged_df = df.drop(["Date", "Average"], axis=1).groupby("Model").max().reset_index()
-        merged_df.insert(loc=0, column="Average", value=merged_df.mean(axis=1, numeric_only=True))
-        df = df[["Model", "Date"]].merge(merged_df, on="Model", how="left")
-        df.drop_duplicates(subset=["Model"], inplace=True)
-        df = df.sort_values(by=["Average"], ascending=False).round(2)
+    # Drop date and aggregate results by model name
+    df = df.drop("Date", axis=1).groupby("Model").agg(agg).reset_index()
+
     return df
 
 
-def refresh(merge_values: bool = True):
-    return get_leaderboard_df(merge_values)
+def refresh(agg: str = "max"):
+    return get_leaderboard_df(agg=agg)
 
 
 # Function to update the table based on search query
-def update_table(search_query):
-    df = get_leaderboard_df()
+def update_table(search_query, agg):
+    df = get_leaderboard_df(agg)
     if search_query:
         search_terms = search_query.split(";")
         search_terms = [term.strip().lower() for term in search_terms]
@@ -157,14 +155,14 @@ def update_table(search_query):
 
 
 def filter_columns(cols):
-    index_cols = list(leaderboard_df.columns[:2])
+    index_cols = list(leaderboard_df.columns[:1])
     new_cols = index_cols + cols
     df = get_leaderboard_df()
     df = df.copy()[new_cols]
     # Drop rows with NaN values
     df = df.copy().dropna(how="all", axis=0, subset=[c for c in df.columns if c in cols])
     # Recompute average
-    df.insert(loc=2, column="Average", value=df.mean(axis=1, numeric_only=True))
+    df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
     return df
 
 
@@ -178,14 +176,15 @@ with demo:
     gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
     with gr.Row():
         search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)
-        merge_values = gr.Checkbox(
-            value=True,
-            label="Merge evals",
-            info="Merge evals for the same model. If there are duplicates, we display the largest one.",
+        agg = gr.Radio(
+            ["min", "max", "mean"],
+            value="max",
+            label="Aggregation",
+            info="How to aggregate results for each model",
         )
     with gr.Row():
         cols_bar = gr.CheckboxGroup(
-            choices=[c for c in leaderboard_df.columns[2:] if c != "Average"],
+            choices=[c for c in leaderboard_df.columns[1:] if c != "Average"],
             show_label=False,
             info="Select columns to display",
         )
@@ -195,14 +194,14 @@ with demo:
         value=leaderboard_df,
         wrap=True,
         height=1000,
-        column_widths=[400, 110] + [(220 + len(c)) for c in leaderboard_df.columns[2:]],
+        column_widths=[400, 110] + [(220 + len(c)) for c in leaderboard_df.columns[1:]],
     )
     with gr.Row():
         refresh_button = gr.Button("Refresh")
 
     cols_bar.change(filter_columns, inputs=[cols_bar], outputs=[leaderboard_table])
-    merge_values.change(refresh, inputs=[merge_values], outputs=[leaderboard_table])
-    search_bar.submit(update_table, inputs=[search_bar], outputs=[leaderboard_table])
+    agg.change(refresh, inputs=[agg], outputs=[leaderboard_table])
+    search_bar.submit(update_table, inputs=[search_bar, agg], outputs=[leaderboard_table])
    refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table])
 
     demo.launch()
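
Note on the behavioral change above: the `merge_values` checkbox (which always kept the largest duplicate) is replaced by an `agg` radio, and duplicate rows per model are now collapsed with `groupby("Model").agg(agg)`. Since `Average` is now inserted before the groupby, it is aggregated like any other metric column rather than recomputed after merging. A minimal sketch of the new merge semantics, using invented toy data rather than real eval results:

import pandas as pd

# Two eval runs of the same model on different dates (values invented).
df = pd.DataFrame(
    {
        "Model": ["org_model_main", "org_model_main"],
        "Date": ["2024-03-02", "2024-03-04"],
        "gsm8k": [59.89, 62.17],
        "mmlu": [70.60, None],
    }
)

# The new code path: drop the date, then aggregate duplicates per model.
merged = df.drop("Date", axis=1).groupby("Model").agg("max").reset_index()
print(merged)  # one row: gsm8k 62.17, mmlu 70.6 (max ignores the NaN)

Choosing "mean" in the radio averages the runs instead, and "min" keeps the worst one; the new `refresh` and `update_table` signatures thread that choice through from the UI.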
debug.ipynb CHANGED
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 34,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -10,16 +10,18 @@
     "from pathlib import Path\n",
     "\n",
     "import gradio as gr\n",
-    "import pandas as pd"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def get_leaderboard_df():\n",
+    "import pandas as pd\n",
+    "\n",
+    "TITLE = \"\"\"<h1 align=\"center\" id=\"space-title\">LLM Leaderboard for H4 Models</h1>\"\"\"\n",
+    "\n",
+    "DESCRIPTION = f\"\"\"\n",
+    "Evaluation of H4 and community models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.\n",
+    "\"\"\"\n",
+    "\n",
+    "BENCHMARKS_TO_SKIP = [\"math\", \"mini_math\"]\n",
+    "\n",
+    "\n",
+    "def get_leaderboard_df(agg : str = \"max\"):\n",
     "    filepaths = list(Path(\"eval_results\").rglob(\"*.json\"))\n",
     "\n",
     "    # Parse filepaths to get unique models\n",
@@ -35,14 +37,17 @@
     "    # Extract data from each file and populate the DataFrame\n",
     "    for filepath in filepaths:\n",
     "        path_parts = Path(filepath).parts\n",
-    "        date = filepath.stem.split(\"_\")[-1][:-3].split(\"T\")[0]\n",
+    "        date = filepath.stem.split(\"_\")[-1][:-3]\n",
     "        model_revision = \"_\".join(path_parts[1:4]) + \"_\" + date\n",
-    "        task = path_parts[4].capitalize()\n",
+    "        task = path_parts[4]\n",
     "        df.loc[model_revision, \"Date\"] = date\n",
     "\n",
     "        with open(filepath, \"r\") as file:\n",
     "            data = json.load(file)\n",
     "            first_result_key = next(iter(data[\"results\"]))  # gets the first key in 'results'\n",
+    "            # Skip benchmarks that we don't want to include in the leaderboard\n",
+    "            if task.lower() in BENCHMARKS_TO_SKIP:\n",
+    "                continue\n",
     "            # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard\n",
     "            if task.lower() == \"truthfulqa\":\n",
     "                value = data[\"results\"][first_result_key][\"truthfulqa_mc2\"]\n",
@@ -51,44 +56,116 @@
     "                value = data[\"results\"][first_result_key][\"prompt_level_loose_acc\"]\n",
     "            # MMLU has several metrics but we report just the average one\n",
     "            elif task.lower() == \"mmlu\":\n",
-    "                value = data[\"results\"][\"lighteval|mmlu:_average|5\"][\"acc\"]\n",
+    "                value = [v[\"acc\"] for k, v in data[\"results\"].items() if \"_average\" in k.lower()][0]\n",
     "            # HellaSwag and ARC reports acc_norm\n",
     "            elif task.lower() in [\"hellaswag\", \"arc\"]:\n",
     "                value = data[\"results\"][first_result_key][\"acc_norm\"]\n",
+    "            # BBH has several metrics but we report just the average one\n",
+    "            elif task.lower() == \"bbh\":\n",
+    "                if \"all\" in data[\"results\"]:\n",
+    "                    value = data[\"results\"][\"all\"][\"acc\"]\n",
+    "                else:\n",
+    "                    value = -100\n",
+    "            # AGIEval reports acc_norm\n",
+    "            elif task.lower() == \"agieval\":\n",
+    "                value = data[\"results\"][\"all\"][\"acc_norm\"]\n",
+    "            # MATH reports qem\n",
+    "            elif task.lower() in [\"math\", \"math_v2\", \"aimo_kaggle\"]:\n",
+    "                value = data[\"results\"][\"all\"][\"qem\"]\n",
     "            else:\n",
     "                first_metric_key = next(\n",
     "                    iter(data[\"results\"][first_result_key])\n",
     "                )  # gets the first key in the first result\n",
     "                value = data[\"results\"][first_result_key][first_metric_key]  # gets the value of the first metric\n",
-    "            df.loc[model_revision, task] = value\n",
     "\n",
-    "    # Put IFEval in first column\n",
-    "    ifeval_col = df.pop(\"Ifeval\")\n",
-    "    df.insert(1, \"Ifeval\", ifeval_col)\n",
+    "            # For mini_math we report 5 metrics, one for each level and store each one as a separate row in the dataframe\n",
+    "            if task.lower() in [\"mini_math_v2\"]:\n",
+    "                for k, v in data[\"results\"].items():\n",
+    "                    if k != \"all\":\n",
+    "                        level = k.split(\"|\")[1].split(\":\")[-1]\n",
+    "                        value = v[\"qem\"]\n",
+    "                        df.loc[model_revision, f\"{task}_{level}\"] = value\n",
+    "            # For kaggle_pot we report N metrics, one for each prompt and store each one as a separate row in the dataframe\n",
+    "            elif task.lower() in [\"aimo_kaggle_medium_pot\"]:\n",
+    "                for k, v in data[\"results\"].items():\n",
+    "                    if k != \"all\" and \"_average\" not in k:\n",
+    "                        version = k.split(\"|\")[1].split(\":\")[-1]\n",
+    "                        value = v[\"qem\"] if \"qem\" in v else v[\"score\"]\n",
+    "                        df.loc[model_revision, f\"{task}_{version}\"] = value\n",
+    "            # For kaggle_pot we report N metrics, one for each prompt and store each one as a separate row in the dataframe\n",
+    "            elif task.lower() in [\"aimo_kaggle_hard_pot\"]:\n",
+    "                for k, v in data[\"results\"].items():\n",
+    "                    if k != \"all\" and \"_average\" not in k:\n",
+    "                        version = k.split(\"|\")[1].split(\":\")[-1]\n",
+    "                        value = v[\"qem\"] if \"qem\" in v else v[\"score\"]\n",
+    "                        df.loc[model_revision, f\"{task}_{version}\"] = value\n",
+    "            # For kaggle_tora we report accuracy, so need to divide by 100\n",
+    "            elif task.lower() in [\n",
+    "                \"aimo_tora_eval_kaggle_medium\",\n",
+    "                \"aimo_tora_eval_kaggle_hard\",\n",
+    "                \"aimo_kaggle_fast_eval_hard\",\n",
+    "                \"aimo_kaggle_tora_medium\",\n",
+    "                \"aimo_kaggle_tora_hard\",\n",
+    "                \"aimo_kaggle_tora_medium_extended\",\n",
+    "                \"aimo_kaggle_tora_hard_extended\",\n",
+    "            ]:\n",
+    "                for k, v in data[\"results\"].items():\n",
+    "                    value = float(v[\"qem\"]) / 100.0\n",
+    "                    df.loc[model_revision, f\"{task}\"] = value\n",
+    "            # For AlpacaEval we report base winrate and lenght corrected one\n",
+    "            elif task.lower() == \"alpaca_eval\":\n",
+    "                value = data[\"results\"][first_result_key][\"win_rate\"]\n",
+    "                df.loc[model_revision, \"Alpaca_eval\"] = value / 100.0\n",
+    "                value = data[\"results\"][first_result_key][\"length_controlled_winrate\"]\n",
+    "                df.loc[model_revision, \"Alpaca_eval_lc\"] = value / 100.0\n",
+    "            else:\n",
+    "                df.loc[model_revision, task] = float(value)\n",
+    "\n",
     "    # Drop rows where every entry is NaN\n",
     "    df = df.dropna(how=\"all\", axis=0, subset=[c for c in df.columns if c != \"Date\"])\n",
-    "    df.insert(loc=1, column=\"Average\", value=df.mean(axis=1, numeric_only=True))\n",
+    "\n",
+    "    # Trim minimath column names\n",
+    "    df.columns = [c.replace(\"_level_\", \"_l\") for c in df.columns]\n",
+    "\n",
+    "    # Trim AIMO column names\n",
+    "    df.columns = [c.replace(\"aimo_\", \"\") for c in df.columns]\n",
+    "\n",
+    "    df.insert(loc=0, column=\"Average\", value=df.mean(axis=1, numeric_only=True))\n",
+    "\n",
     "    # Convert all values to percentage\n",
     "    df[df.select_dtypes(include=[\"number\"]).columns] *= 100.0\n",
     "    df = df.sort_values(by=[\"Average\"], ascending=False)\n",
     "    df = df.reset_index().rename(columns={\"index\": \"Model\"}).round(2)\n",
     "    # Strip off date from model name\n",
     "    df[\"Model\"] = df[\"Model\"].apply(lambda x: x.rsplit(\"_\", 1)[0])\n",
+    "\n",
+    "    # Drop date and aggregate results by model name\n",
+    "    df = df.drop(\"Date\", axis=1).groupby(\"Model\").agg(agg).reset_index()\n",
+    "\n",
     "    return df"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = get_leaderboard_df(agg='mean')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
    "metadata": {},
    "outputs": [],
    "source": [
-    "df = get_leaderboard_df()"
+    "# df"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 40,
    "metadata": {},
    "outputs": [
     {
@@ -113,208 +190,43 @@
     " <tr style=\"text-align: right;\">\n",
     " <th></th>\n",
     " <th>Model</th>\n",
-    " <th>Date</th>\n",
     " <th>Average</th>\n",
-    " <th>Ifeval</th>\n",
-    " <th>Truthfulqa</th>\n",
-    " <th>Winogrande</th>\n",
-    " <th>Gsm8k</th>\n",
-    " <th>Mmlu</th>\n",
-    " <th>Hellaswag</th>\n",
-    " <th>Arc</th>\n",
+    " <th>kaggle_tora_medium_extended</th>\n",
+    " <th>kaggle_tora_hard_extended</th>\n",
     " </tr>\n",
     " </thead>\n",
     " <tbody>\n",
     " <tr>\n",
-    [... ten preview rows (indices 0-4 and 269-273) of <th>/<td> cells; same figures as the text/plain rendering below ...]
+    " <th>1741</th>\n",
+    " <td>AI-MO_deepseek-math-7b-sft_aimo_v38.15.gptq-8bits</td>\n",
+    " <td>28.89</td>\n",
+    " <td>61.45</td>\n",
+    " <td>28.89</td>\n",
     " </tr>\n",
     " </tbody>\n",
     "</table>\n",
-    "<p>274 rows × 10 columns</p>\n",
     "</div>"
    ],
    "text/plain": [
-    "                                                Model        Date  Average  \\\n",
-    "0              NousResearch_Nous-Hermes-2-Yi-34B_main  2024-03-04    74.01   \n",
-    "1              deepseek-ai_deepseek-llm-67b-chat_main  2024-03-05    71.62   \n",
-    "2    NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO_main  2024-03-02    70.43   \n",
-    "3           mistralai_Mixtral-8x7B-Instruct-v0.1_main  2024-03-02    69.80   \n",
-    "4              deepseek-ai_deepseek-llm-67b-chat_main  2024-03-04    67.03   \n",
-    "..                                                ...         ...      ...   \n",
-    "269            HuggingFaceH4_starcoder2-15b-ift_v18.0  2024-03-10    11.23   \n",
-    "270                HuggingFaceH4_mistral-7b-ift_v49.0  2024-03-07    10.07   \n",
-    "271                  HuggingFaceH4_starchat-beta_main  2024-03-12     8.13   \n",
-    "272             HuggingFaceH4_starcoder2-15b-ift_v7.0  2024-03-10     7.88   \n",
-    "273             HuggingFaceH4_zephyr-7b-beta-ift_v1.1  2024-03-13     4.71   \n",
+    "                                                  Model  Average  \\\n",
+    "1741  AI-MO_deepseek-math-7b-sft_aimo_v38.15.gptq-8bits    28.89   \n",
     "\n",
-    "     Ifeval  Truthfulqa  Winogrande  Gsm8k   Mmlu  Hellaswag    Arc  \n",
-    "0       NaN       61.44       80.58    NaN  76.24      83.79  68.00  \n",
-    "1     55.27         NaN         NaN  76.12  71.18      83.94    NaN  \n",
-    "2     59.33       64.76       78.53  62.17  71.96      85.42  70.82  \n",
-    "3     55.08       70.79       73.56  59.89  70.60      86.68  72.01  \n",
-    "4       NaN       57.78       79.16    NaN    NaN        NaN  64.16  \n",
-    "..      ...         ...         ...    ...    ...        ...    ...  \n",
-    "269   21.63         NaN         NaN   0.83    NaN        NaN    NaN  \n",
-    "270   20.15         NaN         NaN   0.00    NaN        NaN    NaN  \n",
-    "271    8.13         NaN         NaN    NaN    NaN        NaN    NaN  \n",
-    "272   12.57         NaN         NaN   3.18    NaN        NaN    NaN  \n",
-    "273    9.43         NaN         NaN   0.00    NaN        NaN    NaN  \n",
-    "\n",
-    "[274 rows x 10 columns]"
+    "      kaggle_tora_medium_extended  kaggle_tora_hard_extended  \n",
+    "1741                        61.45                      28.89  "
    ]
   },
-   "execution_count": 5,
+   "execution_count": 40,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
-    "df"
+    "df.query(\"Model == 'AI-MO_deepseek-math-7b-sft_aimo_v38.15.gptq-8bits'\").dropna(axis=1, how=\"all\")"
  ]
 },
 {
  "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 38,
  "metadata": {},
  "outputs": [
   {
@@ -339,180 +251,38 @@
     " <tr style=\"text-align: right;\">\n",
     " <th></th>\n",
     " <th>Model</th>\n",
-    " <th>Ifeval</th>\n",
-    " <th>Truthfulqa</th>\n",
-    " <th>Winogrande</th>\n",
-    " <th>Gsm8k</th>\n",
-    " <th>Mmlu</th>\n",
-    " <th>Hellaswag</th>\n",
-    " <th>Arc</th>\n",
+    " <th>Average</th>\n",
+    " <th>kaggle_tora_medium_extended</th>\n",
+    " <th>kaggle_tora_hard_extended</th>\n",
     " </tr>\n",
     " </thead>\n",
     " <tbody>\n",
     " <tr>\n",
-    [... ten preview rows (indices 0-4 and 258-262) of <th>/<td> cells; same figures as the text/plain rendering below ...]
+    " <th>1741</th>\n",
+    " <td>AI-MO_deepseek-math-7b-sft_aimo_v38.15.gptq-8bits</td>\n",
+    " <td>65.06</td>\n",
+    " <td>65.06</td>\n",
+    " <td>32.22</td>\n",
     " </tr>\n",
     " </tbody>\n",
     "</table>\n",
-    "<p>263 rows × 8 columns</p>\n",
    "</div>"
    ],
    "text/plain": [
-    "                                         Model  Ifeval  Truthfulqa  \\\n",
-    "0           HuggingFaceH4_mistral-7b-ift_v41.0   44.36       49.35   \n",
-    "1           HuggingFaceH4_mistral-7b-ift_v41.1   47.32       47.89   \n",
-    "2          HuggingFaceH4_mistral-7b-ift_v41.10   32.72       51.05   \n",
-    "3          HuggingFaceH4_mistral-7b-ift_v41.11   37.89       51.05   \n",
-    "4          HuggingFaceH4_mistral-7b-ift_v41.12   37.89       45.94   \n",
-    "..                                         ...     ...         ...   \n",
-    "258    mistralai_Mistral-7B-Instruct-v0.2_main   53.97       70.68   \n",
-    "259  mistralai_Mixtral-8x7B-Instruct-v0.1_main   55.08       70.79   \n",
-    "260            openchat_openchat-3.5-0106_main   54.71       57.55   \n",
-    "261        stabilityai_stablelm-zephyr-3b_main   34.75       46.19   \n",
-    "262     teknium_OpenHermes-2.5-Mistral-7B_main   52.68       58.62   \n",
-    "\n",
-    "     Winogrande  Gsm8k   Mmlu  Hellaswag    Arc  \n",
-    "0         72.93  37.30  60.82      79.70  58.36  \n",
-    "1         72.69  36.32  60.34      79.57  57.51  \n",
-    "2         72.45  25.93  59.75      81.92  59.22  \n",
-    "3         64.56  17.59  57.60      77.65  55.89  \n",
-    "4         63.30  21.15  58.50      74.94  52.73  \n",
-    "..          ...    ...    ...        ...    ...  \n",
-    "258       68.82  38.13  59.43      83.45  65.70  \n",
-    "259       73.56  59.89  70.60      86.68  72.01  \n",
-    "260       72.53  66.19  63.72      80.10  61.01  \n",
-    "261       58.41  40.18  45.18      71.57  45.82  \n",
-    "262       72.14  54.06  63.01      82.34  62.97  \n",
+    "                                                  Model  Average  \\\n",
+    "1741  AI-MO_deepseek-math-7b-sft_aimo_v38.15.gptq-8bits    65.06   \n",
     "\n",
-    "[263 rows x 8 columns]"
+    "      kaggle_tora_medium_extended  kaggle_tora_hard_extended  \n",
+    "1741                        65.06                      32.22  "
    ]
   },
-   "execution_count": 14,
+   "execution_count": 38,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
-    "new_df = df.drop([\"Date\", \"Average\"], axis=1).groupby(\"Model\").max().reset_index()\n",
-    "new_df"
+    "df.query(\"Model == 'AI-MO_deepseek-math-7b-sft_aimo_v38.15.gptq-8bits'\").dropna(axis=1, how=\"all\")"
  ]
 },
 {
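
Note on the `date`/`task` parsing that changes in both files: it assumes results are laid out as eval_results/<org>/<model>/<revision>/<task>/results_<timestamp>.json. A sketch with a hypothetical filename (illustrative only, not taken from the repo):

from pathlib import Path

filepath = Path("eval_results/org/model/main/gsm8k/results_2024-03-04T14-56-28.147845.json")
path_parts = filepath.parts

# Old behaviour: keep only the calendar date, so two runs from the same
# day collapsed onto one row key.
old_date = filepath.stem.split("_")[-1][:-3].split("T")[0]  # '2024-03-04'

# New behaviour: keep the (truncated) timestamp, so same-day runs stay
# distinct until the final groupby merges them.
new_date = filepath.stem.split("_")[-1][:-3]  # '2024-03-04T14-56-28.147'

task = path_parts[4]  # 'gsm8k'; the notebook version also stops calling .capitalize() on it
model_revision = "_".join(path_parts[1:4]) + "_" + new_date  # 'org_model_main_2024-03-04T14-56-28.147'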
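
Similarly, the MMLU branch swaps a hard-coded lighteval key for a scan over result keys containing "_average", so it no longer breaks when the suite name or few-shot count changes. A sketch against an illustrative payload (only the key shapes are taken from the diff; the scores are made up):

data = {
    "results": {
        "lighteval|mmlu:abstract_algebra|5": {"acc": 0.31},
        "lighteval|mmlu:_average|5": {"acc": 0.706},
    }
}

# Old: exact key lookup.
value_old = data["results"]["lighteval|mmlu:_average|5"]["acc"]

# New: find whichever key carries the average, whatever its exact name.
value_new = [v["acc"] for k, v in data["results"].items() if "_average" in k.lower()][0]
assert value_old == value_new == 0.706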