app.py CHANGED
@@ -1,10 +1,11 @@
1
  import os
2
- import time
3
  import logging
 
4
  import gradio as gr
5
- import pandas as pd
6
  from apscheduler.schedulers.background import BackgroundScheduler
7
  from huggingface_hub import snapshot_download
 
8
  from gradio_space_ci import enable_space_ci
9
 
10
  from src.display.about import (
@@ -49,14 +50,12 @@ from src.submission.submit import add_new_eval
49
  from src.tools.collections import update_collections
50
  from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
51
 
52
-
53
  # Configure logging
54
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
55
 
56
  # Start ephemeral Spaces on PRs (see config in README.md)
57
  enable_space_ci()
58
 
59
-
60
  def restart_space():
61
  API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
62
 
@@ -143,140 +142,6 @@ def load_and_create_plots():
143
  return plot_df
144
 
145
 
146
- # Searching and filtering
147
- def update_table(
148
- hidden_df: pd.DataFrame,
149
- columns: list,
150
- type_query: list,
151
- precision_query: str,
152
- size_query: list,
153
- hide_models: list,
154
- query: str,
155
- ):
156
- filtered_df = filter_models(
157
- df=hidden_df,
158
- type_query=type_query,
159
- size_query=size_query,
160
- precision_query=precision_query,
161
- hide_models=hide_models,
162
- )
163
- filtered_df = filter_queries(query, filtered_df)
164
- df = select_columns(filtered_df, columns)
165
- return df
166
-
167
-
168
- def load_query(request: gr.Request): # triggered only once at startup => read query parameter if it exists
169
- query = request.query_params.get("query") or ""
170
- return (
171
- query,
172
- query,
173
- ) # return one for the "search_bar", one for a hidden component that triggers a reload only if value has changed
174
-
175
-
176
- def search_model(df: pd.DataFrame, query: str) -> pd.DataFrame:
177
- return df[(df[AutoEvalColumn.fullname.name].str.contains(query, case=False, na=False))]
178
-
179
- def search_license(df: pd.DataFrame, query: str) -> pd.DataFrame:
180
- return df[df[AutoEvalColumn.license.name].str.contains(query, case=False, na=False)]
181
-
182
- def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
183
- always_here_cols = [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
184
- dummy_col = [AutoEvalColumn.fullname.name]
185
- filtered_df = df[always_here_cols + [c for c in COLS if c in df.columns and c in columns] + dummy_col]
186
- return filtered_df
187
-
188
- def filter_queries(query: str, df: pd.DataFrame):
189
- tmp_result_df = []
190
-
191
- # Empty query return the same df
192
- if query == "":
193
- return df
194
-
195
- # all_queries = [q.strip() for q in query.split(";")]
196
- # license_queries = []
197
- all_queries = [q.strip() for q in query.split(";") if q.strip() != ""]
198
- model_queries = [q for q in all_queries if not q.startswith("licence")]
199
- license_queries_raw = [q for q in all_queries if q.startswith("license")]
200
- license_queries = [
201
- q.replace("license:", "").strip() for q in license_queries_raw if q.replace("license:", "").strip() != ""
202
- ]
203
-
204
- # Handling model name search
205
- for query in model_queries:
206
- tmp_df = search_model(df, query)
207
- if len(tmp_df) > 0:
208
- tmp_result_df.append(tmp_df)
209
-
210
- if not tmp_result_df and not license_queries:
211
- # Nothing is found, no license_queries -> return empty df
212
- return pd.DataFrame(columns=df.columns)
213
-
214
- if tmp_result_df:
215
- df = pd.concat(tmp_result_df)
216
- df = df.drop_duplicates(
217
- subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
218
- )
219
-
220
- if not license_queries:
221
- return df
222
-
223
- # Handling license search
224
- tmp_result_df = []
225
- for query in license_queries:
226
- tmp_df = search_license(df, query)
227
- if len(tmp_df) > 0:
228
- tmp_result_df.append(tmp_df)
229
-
230
- if not tmp_result_df:
231
- # Nothing is found, return empty df
232
- return pd.DataFrame(columns=df.columns)
233
-
234
- df = pd.concat(tmp_result_df)
235
- df = df.drop_duplicates(
236
- subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
237
- )
238
-
239
- return df
240
-
241
-
242
- def filter_models(
243
- df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, hide_models: list
244
- ) -> pd.DataFrame:
245
- # Show all models
246
- if "Private or deleted" in hide_models:
247
- filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
248
- else:
249
- filtered_df = df
250
-
251
- if "Contains a merge/moerge" in hide_models:
252
- filtered_df = filtered_df[filtered_df[AutoEvalColumn.merged.name] == False]
253
-
254
- if "MoE" in hide_models:
255
- filtered_df = filtered_df[filtered_df[AutoEvalColumn.moe.name] == False]
256
-
257
- if "Flagged" in hide_models:
258
- filtered_df = filtered_df[filtered_df[AutoEvalColumn.flagged.name] == False]
259
-
260
- type_emoji = [t[0] for t in type_query]
261
- filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
262
- filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
263
-
264
- numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
265
- params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
266
- mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
267
- filtered_df = filtered_df.loc[mask]
268
-
269
- return filtered_df
270
-
271
-
272
- leaderboard_df = filter_models(
273
- df=leaderboard_df,
274
- type_query=[t.to_str(" : ") for t in ModelType],
275
- size_query=list(NUMERIC_INTERVALS.keys()),
276
- precision_query=[i.value.name for i in Precision],
277
- hide_models=["Private or deleted", "Contains a merge/moerge", "Flagged"], # Deleted, merges, flagged, MoEs
278
- )
279
-
280
  demo = gr.Blocks(css=custom_css)
281
  with demo:
282
  gr.HTML(TITLE)
@@ -284,135 +149,40 @@ with demo:
284
 
285
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
286
  with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
287
- with gr.Row():
288
- with gr.Column():
289
- with gr.Row():
290
- search_bar = gr.Textbox(
291
- placeholder="🔍 Search models or licenses (e.g., 'model_name; license: MIT') and press ENTER...",
292
- show_label=False,
293
- elem_id="search-bar",
294
- )
295
- with gr.Row():
296
- shown_columns = gr.CheckboxGroup(
297
- choices=[
298
- c.name
299
- for c in fields(AutoEvalColumn)
300
- if not c.hidden and not c.never_hidden and not c.dummy
301
- ],
302
- value=[
303
- c.name
304
- for c in fields(AutoEvalColumn)
305
- if c.displayed_by_default and not c.hidden and not c.never_hidden
306
- ],
307
- label="Select columns to show",
308
- elem_id="column-select",
309
- interactive=True,
310
- )
311
- with gr.Row():
312
- hide_models = gr.CheckboxGroup(
313
- label="Hide models",
314
- choices=["Private or deleted", "Contains a merge/moerge", "Flagged", "MoE"],
315
- value=["Private or deleted", "Contains a merge/moerge", "Flagged"],
316
- interactive=True,
317
- )
318
- with gr.Column(min_width=320):
319
- # with gr.Box(elem_id="box-filter"):
320
- filter_columns_type = gr.CheckboxGroup(
321
- label="Model types",
322
- choices=[t.to_str() for t in ModelType],
323
- value=[t.to_str() for t in ModelType],
324
- interactive=True,
325
- elem_id="filter-columns-type",
326
- )
327
- filter_columns_precision = gr.CheckboxGroup(
328
- label="Precision",
329
- choices=[i.value.name for i in Precision],
330
- value=[i.value.name for i in Precision],
331
- interactive=True,
332
- elem_id="filter-columns-precision",
333
- )
334
- filter_columns_size = gr.CheckboxGroup(
335
- label="Model sizes (in billions of parameters)",
336
- choices=list(NUMERIC_INTERVALS.keys()),
337
- value=list(NUMERIC_INTERVALS.keys()),
338
- interactive=True,
339
- elem_id="filter-columns-size",
340
- )
341
-
342
- leaderboard_table = gr.components.Dataframe(
343
- value=leaderboard_df[
344
- [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
345
- + shown_columns.value
346
- + [AutoEvalColumn.fullname.name]
347
  ],
348
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
349
- datatype=TYPES,
350
- elem_id="leaderboard-table",
351
- interactive=False,
352
- visible=True,
353
- )
354
-
355
- # Dummy leaderboard for handling the case when the user uses backspace key
356
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
357
- value=original_df[COLS],
358
- headers=COLS,
359
- datatype=TYPES,
360
- visible=False,
361
- )
362
- search_bar.submit(
363
- update_table,
364
- [
365
- hidden_leaderboard_table_for_search,
366
- shown_columns,
367
- filter_columns_type,
368
- filter_columns_precision,
369
- filter_columns_size,
370
- hide_models,
371
- search_bar,
372
  ],
373
- leaderboard_table,
374
- )
375
-
376
- # Define a hidden component that will trigger a reload only if a query parameter has been set
377
- hidden_search_bar = gr.Textbox(value="", visible=False)
378
- hidden_search_bar.change(
379
- update_table,
380
- [
381
- hidden_leaderboard_table_for_search,
382
- shown_columns,
383
- filter_columns_type,
384
- filter_columns_precision,
385
- filter_columns_size,
386
- hide_models,
387
- search_bar,
388
  ],
389
- leaderboard_table,
390
  )
391
- # Check query parameter once at startup and update search bar + hidden component
392
- demo.load(load_query, inputs=[], outputs=[search_bar, hidden_search_bar])
393
-
394
- for selector in [
395
- shown_columns,
396
- filter_columns_type,
397
- filter_columns_precision,
398
- filter_columns_size,
399
- hide_models,
400
- ]:
401
- selector.change(
402
- update_table,
403
- [
404
- hidden_leaderboard_table_for_search,
405
- shown_columns,
406
- filter_columns_type,
407
- filter_columns_precision,
408
- filter_columns_size,
409
- hide_models,
410
- search_bar,
411
- ],
412
- leaderboard_table,
413
- queue=True,
414
- )
415
-
416
  with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
417
  with gr.Row():
418
  with gr.Column():
@@ -543,4 +313,4 @@ scheduler.add_job(restart_space, "interval", hours=3) # restarted every 3h
543
  scheduler.add_job(update_dynamic_files, "interval", hours=2) # launched every 2 hour
544
  scheduler.start()
545
 
546
- demo.queue(default_concurrency_limit=40).launch()
 
1
  import os
2
+ import pandas as pd
3
  import logging
4
+ import time
5
  import gradio as gr
 
6
  from apscheduler.schedulers.background import BackgroundScheduler
7
  from huggingface_hub import snapshot_download
8
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
9
  from gradio_space_ci import enable_space_ci
10
 
11
  from src.display.about import (
 
50
  from src.tools.collections import update_collections
51
  from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
52
 
 
53
  # Configure logging
54
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
55
 
56
  # Start ephemeral Spaces on PRs (see config in README.md)
57
  enable_space_ci()
58
 
 
59
  def restart_space():
60
  API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
61
 
 
142
  return plot_df
143
 
144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  demo = gr.Blocks(css=custom_css)
146
  with demo:
147
  gr.HTML(TITLE)
 
149
 
150
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
151
  with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
152
+ leaderboard = Leaderboard(
153
+ value=leaderboard_df,
154
+ datatype=[c.type for c in fields(AutoEvalColumn)],
155
+ select_columns=SelectColumns(
156
+ default_selection=[
157
+ c.name
158
+ for c in fields(AutoEvalColumn)
159
+ if c.displayed_by_default
160
+ ],
161
+ cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
162
+ label="Select Columns to Display:",
163
+ ),
164
+ search_columns=[
165
+ AutoEvalColumn.model.name,
166
+ AutoEvalColumn.fullname.name,
167
+ AutoEvalColumn.license.name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  ],
169
+ hide_columns=[
170
+ c.name
171
+ for c in fields(AutoEvalColumn)
172
+ if c.hidden
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
  ],
174
+ filter_columns=[
175
+ ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
176
+ ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
177
+ ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=150, label="Select the number of parameters (B)"),
178
+ ColumnFilter(AutoEvalColumn.still_on_hub.name, type="boolean", label="Private or deleted", default=True),
179
+ ColumnFilter(AutoEvalColumn.merged.name, type="boolean", label="Contains a merge/moerge", default=True),
180
+ ColumnFilter(AutoEvalColumn.moe.name, type="boolean", label="MoE", default=False),
181
+ ColumnFilter(AutoEvalColumn.not_flagged.name, type="boolean", label="Flagged", default=True),
 
 
 
 
 
 
 
182
  ],
183
+ bool_checkboxgroup_label="Hide models"
184
  )
185
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
187
  with gr.Row():
188
  with gr.Column():
 
313
  scheduler.add_job(update_dynamic_files, "interval", hours=2) # launched every 2 hour
314
  scheduler.start()
315
 
316
+ demo.queue(default_concurrency_limit=40).launch()
pyproject.toml CHANGED
@@ -44,9 +44,10 @@ tqdm = "4.65.0"
44
  transformers = "4.40.0"
45
  tokenizers = ">=0.15.0"
46
  gradio-space-ci = {git = "https://huggingface.co/spaces/Wauplin/gradio-space-ci", rev = "0.2.3"}
47
- gradio = "4.9.0"
48
  isort = "^5.13.2"
49
  ruff = "^0.3.5"
 
50
 
51
  [build-system]
52
  requires = ["poetry-core"]
 
44
  transformers = "4.40.0"
45
  tokenizers = ">=0.15.0"
46
  gradio-space-ci = {git = "https://huggingface.co/spaces/Wauplin/gradio-space-ci", rev = "0.2.3"}
47
+ gradio = "4.20.0"
48
  isort = "^5.13.2"
49
  ruff = "^0.3.5"
50
+ gradio-leaderboard = "0.0.8"
51
 
52
  [build-system]
53
  requires = ["poetry-core"]
requirements.txt CHANGED
@@ -13,4 +13,6 @@ sentencepiece
13
  tqdm==4.65.0
14
  transformers==4.40.0
15
  tokenizers>=0.15.0
16
- gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/[email protected] # CI !!!
 
 
 
13
  tqdm==4.65.0
14
  transformers==4.40.0
15
  tokenizers>=0.15.0
16
+ gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/[email protected] # CI !!!
17
+ gradio==4.20.0
18
+ gradio_leaderboard==0.0.8
src/display/utils.py CHANGED
@@ -89,7 +89,7 @@ auto_eval_column_dict.append(
89
  ["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False, hidden=True)]
90
  )
91
  auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
92
- auto_eval_column_dict.append(["flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
93
  auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
94
  # Dummy column for the search bar (hidden by the custom CSS)
95
  auto_eval_column_dict.append(["fullname", ColumnContent, ColumnContent("fullname", "str", False, dummy=True)])
@@ -123,7 +123,7 @@ baseline_row = {
123
  AutoEvalColumn.gsm8k.name: 0.21,
124
  AutoEvalColumn.fullname.name: "baseline",
125
  AutoEvalColumn.model_type.name: "",
126
- AutoEvalColumn.flagged.name: False,
127
  }
128
 
129
  # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
@@ -148,7 +148,7 @@ human_baseline_row = {
148
  AutoEvalColumn.gsm8k.name: 100,
149
  AutoEvalColumn.fullname.name: "human_baseline",
150
  AutoEvalColumn.model_type.name: "",
151
- AutoEvalColumn.flagged.name: False,
152
  }
153
 
154
 
 
89
  ["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False, hidden=True)]
90
  )
91
  auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
92
+ auto_eval_column_dict.append(["not_flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
93
  auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
94
  # Dummy column for the search bar (hidden by the custom CSS)
95
  auto_eval_column_dict.append(["fullname", ColumnContent, ColumnContent("fullname", "str", False, dummy=True)])
 
123
  AutoEvalColumn.gsm8k.name: 0.21,
124
  AutoEvalColumn.fullname.name: "baseline",
125
  AutoEvalColumn.model_type.name: "",
126
+ AutoEvalColumn.not_flagged.name: False,
127
  }
128
 
129
  # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
 
148
  AutoEvalColumn.gsm8k.name: 100,
149
  AutoEvalColumn.fullname.name: "human_baseline",
150
  AutoEvalColumn.model_type.name: "",
151
+ AutoEvalColumn.not_flagged.name: False,
152
  }
153
 
154
 
src/leaderboard/filter_models.py CHANGED
@@ -133,11 +133,14 @@ DO_NOT_SUBMIT_MODELS = [
133
  def flag_models(leaderboard_data: list[dict]):
134
  """Flags models based on external criteria or flagged status."""
135
  for model_data in leaderboard_data:
136
- # Merges and moes are flagged automatically
137
- if model_data[AutoEvalColumn.flagged.name]:
138
- flag_key = "merged"
139
- else:
140
  flag_key = model_data[AutoEvalColumn.fullname.name]
 
 
 
 
 
141
  if flag_key in FLAGGED_MODELS:
142
  issue_num = FLAGGED_MODELS[flag_key].split("/")[-1]
143
  issue_link = model_hyperlink(
@@ -147,9 +150,9 @@ def flag_models(leaderboard_data: list[dict]):
147
  model_data[AutoEvalColumn.model.name] = (
148
  f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
149
  )
150
- model_data[AutoEvalColumn.flagged.name] = True
151
  else:
152
- model_data[AutoEvalColumn.flagged.name] = False
153
 
154
 
155
  def remove_forbidden_models(leaderboard_data: list[dict]):
 
133
  def flag_models(leaderboard_data: list[dict]):
134
  """Flags models based on external criteria or flagged status."""
135
  for model_data in leaderboard_data:
136
+ # If a model is not flagged, use its "fullname" as a key
137
+ if model_data[AutoEvalColumn.not_flagged.name]:
 
 
138
  flag_key = model_data[AutoEvalColumn.fullname.name]
139
+ else:
140
+ # Merges and moes are flagged
141
+ flag_key = "merged"
142
+
143
+ # A key found in FLAGGED_MODELS marks the model as flagged (not_flagged is set to False); any other key leaves it not flagged
144
  if flag_key in FLAGGED_MODELS:
145
  issue_num = FLAGGED_MODELS[flag_key].split("/")[-1]
146
  issue_link = model_hyperlink(
 
150
  model_data[AutoEvalColumn.model.name] = (
151
  f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
152
  )
153
+ model_data[AutoEvalColumn.not_flagged.name] = False
154
  else:
155
+ model_data[AutoEvalColumn.not_flagged.name] = True
156
 
157
 
158
  def remove_forbidden_models(leaderboard_data: list[dict]):
src/leaderboard/read_evals.py CHANGED
@@ -37,7 +37,7 @@ class EvalResult:
37
  date: str = "" # submission date of request file
38
  still_on_hub: bool = True
39
  is_merge: bool = False
40
- flagged: bool = False
41
  status: str = "FINISHED"
42
  # List of tags, initialized to a new empty list for each instance to avoid the pitfalls of mutable default arguments.
43
  tags: List[str] = field(default_factory=list)
@@ -164,7 +164,7 @@ class EvalResult:
164
  self.tags = file_dict.get("tags", [])
165
 
166
  # Calculate `flagged` only if 'tags' is not empty and avoid calculating each time
167
- self.flagged = "flagged" in self.tags
168
 
169
 
170
  def to_dict(self):
@@ -185,9 +185,9 @@ class EvalResult:
185
  AutoEvalColumn.likes.name: self.likes,
186
  AutoEvalColumn.params.name: self.num_params,
187
  AutoEvalColumn.still_on_hub.name: self.still_on_hub,
188
- AutoEvalColumn.merged.name: "merge" in self.tags if self.tags else False,
189
- AutoEvalColumn.moe.name: ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower(),
190
- AutoEvalColumn.flagged.name: self.flagged,
191
  }
192
 
193
  for task in Tasks:
 
37
  date: str = "" # submission date of request file
38
  still_on_hub: bool = True
39
  is_merge: bool = False
40
+ not_flagged: bool = False
41
  status: str = "FINISHED"
42
  # List of tags, initialized to a new empty list for each instance to avoid the pitfalls of mutable default arguments.
43
  tags: List[str] = field(default_factory=list)
 
164
  self.tags = file_dict.get("tags", [])
165
 
166
  # Calculate `flagged` only if 'tags' is not empty and avoid calculating each time
167
+ self.not_flagged = not (any("flagged" in tag for tag in self.tags))
168
 
169
 
170
  def to_dict(self):
 
185
  AutoEvalColumn.likes.name: self.likes,
186
  AutoEvalColumn.params.name: self.num_params,
187
  AutoEvalColumn.still_on_hub.name: self.still_on_hub,
188
+ AutoEvalColumn.merged.name: not( "merge" in self.tags if self.tags else False),
189
+ AutoEvalColumn.moe.name: not ( ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower()) ,
190
+ AutoEvalColumn.not_flagged.name: self.not_flagged,
191
  }
192
 
193
  for task in Tasks:
src/submission/check_validity.py CHANGED
@@ -170,7 +170,6 @@ def get_model_tags(model_card, model: str):
170
  is_moe_from_model_card = any(keyword in model_card.text.lower() for keyword in ["moe", "mixtral"])
171
  # Hardcoding because of gating problem
172
  if "Qwen/Qwen1.5-32B" in model:
173
- print("HERE NSHJNKJSNJLAS")
174
  is_moe_from_model_card = False
175
  is_moe_from_name = "moe" in model.lower().replace("/", "-").replace("_", "-").split("-")
176
  if is_moe_from_model_card or is_moe_from_name or is_moe_from_metadata:
 
170
  is_moe_from_model_card = any(keyword in model_card.text.lower() for keyword in ["moe", "mixtral"])
171
  # Hardcoding because of gating problem
172
  if "Qwen/Qwen1.5-32B" in model:
 
173
  is_moe_from_model_card = False
174
  is_moe_from_name = "moe" in model.lower().replace("/", "-").replace("_", "-").split("-")
175
  if is_moe_from_model_card or is_moe_from_name or is_moe_from_metadata:
src/tools/plots.py CHANGED
@@ -34,7 +34,7 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
34
  # We ignore models that are flagged/no longer on the hub/not finished
35
  to_ignore = (
36
  not row["still_on_hub"]
37
- or row["flagged"]
38
  or current_model in FLAGGED_MODELS
39
  or row["status"] != "FINISHED"
40
  )
@@ -68,7 +68,6 @@ def create_plot_df(scores_df: dict[str : pd.DataFrame]) -> pd.DataFrame:
68
  """
69
  # Initialize the list to store DataFrames
70
  dfs = []
71
-
72
  # Iterate over the cols and create a new DataFrame for each column
73
  for col in BENCHMARK_COLS + [AutoEvalColumn.average.name]:
74
  d = scores_df[col].reset_index(drop=True)
 
34
  # We ignore models that are flagged/no longer on the hub/not finished
35
  to_ignore = (
36
  not row["still_on_hub"]
37
+ or not row["not_flagged"]
38
  or current_model in FLAGGED_MODELS
39
  or row["status"] != "FINISHED"
40
  )
 
68
  """
69
  # Initialize the list to store DataFrames
70
  dfs = []
 
71
  # Iterate over the cols and create a new DataFrame for each column
72
  for col in BENCHMARK_COLS + [AutoEvalColumn.average.name]:
73
  d = scores_df[col].reset_index(drop=True)