WadoodAbdul committed
Commit cc05af6
1 Parent(s): 9f7ed19

intermediate commit

Files changed (5):
  1. app.py +169 -71
  2. src/about.py +14 -7
  3. src/display/utils.py +41 -26
  4. src/envs.py +5 -5
  5. src/leaderboard/read_evals.py +22 -21
app.py CHANGED
@@ -1,4 +1,5 @@
 import subprocess
+
 import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
@@ -22,9 +23,9 @@ from src.display.utils import (
     TYPES,
     AutoEvalColumn,
     ModelType,
-    fields,
+    Precision,
     WeightType,
-    Precision
+    fields,
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
@@ -34,20 +35,21 @@ from src.submission.submit import add_new_eval
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
+
+# try:
+#     print(EVAL_REQUESTS_PATH)
+#     snapshot_download(
+#         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+#     )
+# except Exception:
+#     restart_space()
+# try:
+#     print(EVAL_RESULTS_PATH)
+#     snapshot_download(
+#         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+#     )
+# except Exception:
+#     restart_space()
 
 
 raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
@@ -64,11 +66,11 @@ leaderboard_df = original_df.copy()
 def update_table(
     hidden_df: pd.DataFrame,
     columns: list,
-    type_query: list,
-    precision_query: str,
-    size_query: list,
-    show_deleted: bool,
     query: str,
+    type_query: list = None,
+    precision_query: str = None,
+    size_query: list = None,
+    show_deleted: bool = False,
 ):
     filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
     filtered_df = filter_queries(query, filtered_df)
@@ -86,9 +88,7 @@ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
         AutoEvalColumn.model.name,
     ]
     # We use COLS to maintain sorting
-    filtered_df = df[
-        always_here_cols + [c for c in COLS if c in df.columns and c in columns]
-    ]
+    filtered_df = df[always_here_cols + [c for c in COLS if c in df.columns and c in columns]]
     return filtered_df
 
 
@@ -105,7 +105,11 @@ def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
     if len(final_df) > 0:
         filtered_df = pd.concat(final_df)
         filtered_df = filtered_df.drop_duplicates(
-            subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
+            subset=[
+                AutoEvalColumn.model.name,
+                # AutoEvalColumn.precision.name,
+                # AutoEvalColumn.revision.name,
+            ]
         )
 
     return filtered_df
@@ -115,19 +119,26 @@ def filter_models(
     df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
 ) -> pd.DataFrame:
     # Show all models
-    if show_deleted:
-        filtered_df = df
-    else: # Show only still on the hub models
-        filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
+    # if show_deleted:
+    #     filtered_df = df
+    # else: # Show only still on the hub models
+    #     filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
+
+    filtered_df = df
 
-    type_emoji = [t[0] for t in type_query]
-    filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
-    filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
+    if type_query is not None:
+        type_emoji = [t[0] for t in type_query]
+        filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
 
-    numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
-    params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
-    mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
-    filtered_df = filtered_df.loc[mask]
+    if precision_query is not None:
+        if AutoEvalColumn.precision.name in df.columns:
+            filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
+
+    if size_query is not None:
+        numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
+        params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
+        mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
+        filtered_df = filtered_df.loc[mask]
 
     return filtered_df
 
@@ -138,7 +149,7 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+        with gr.TabItem("🏅 NER Datasets", elem_id="llm-benchmark-tab-table", id=0):
             with gr.Row():
                 with gr.Column():
                     with gr.Row():
@@ -149,11 +160,101 @@
                         )
                     with gr.Row():
                         shown_columns = gr.CheckboxGroup(
-                            choices=[
+                            choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden],
+                            value=[
                                 c.name
                                 for c in fields(AutoEvalColumn)
-                                if not c.hidden and not c.never_hidden
+                                if c.displayed_by_default and not c.hidden and not c.never_hidden
                             ],
+                            label="Select columns to show",
+                            elem_id="column-select",
+                            interactive=True,
+                        )
+                    # with gr.Row():
+                    #     deleted_models_visibility = gr.Checkbox(
+                    #         value=False, label="Show gated/private/deleted models", interactive=True
+                    #     )
+                with gr.Column(min_width=320):
+                    # with gr.Box(elem_id="box-filter"):
+                    filter_columns_type = gr.CheckboxGroup(
+                        label="Model types",
+                        choices=[t.to_str() for t in ModelType],
+                        value=[t.to_str() for t in ModelType],
+                        interactive=True,
+                        elem_id="filter-columns-type",
+                    )
+                    # filter_columns_precision = gr.CheckboxGroup(
+                    #     label="Precision",
+                    #     choices=[i.value.name for i in Precision],
+                    #     value=[i.value.name for i in Precision],
+                    #     interactive=True,
+                    #     elem_id="filter-columns-precision",
+                    # )
+                    # filter_columns_size = gr.CheckboxGroup(
+                    #     label="Model sizes (in billions of parameters)",
+                    #     choices=list(NUMERIC_INTERVALS.keys()),
+                    #     value=list(NUMERIC_INTERVALS.keys()),
+                    #     interactive=True,
+                    #     elem_id="filter-columns-size",
+                    # )
+
+            leaderboard_table = gr.components.Dataframe(
+                value=leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
+                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
+                datatype=TYPES,
+                elem_id="leaderboard-table",
+                interactive=False,
+                visible=True,
+            )
+
+            # Dummy leaderboard for handling the case when the user uses backspace key
+            hidden_leaderboard_table_for_search = gr.components.Dataframe(
+                value=original_df[COLS],
+                headers=COLS,
+                datatype=TYPES,
+                visible=False,
+            )
+            search_bar.submit(
+                update_table,
+                [
+                    hidden_leaderboard_table_for_search,
+                    shown_columns,
+                    search_bar,
+                    filter_columns_type,
+                ],
+                leaderboard_table,
+            )
+            for selector in [
+                shown_columns,
+                filter_columns_type,
+                # filter_columns_precision,
+                # filter_columns_size,
+                # deleted_models_visibility,
+            ]:
+                selector.change(
+                    update_table,
+                    [
+                        hidden_leaderboard_table_for_search,
+                        shown_columns,
+                        search_bar,
+                        filter_columns_type,
+                    ],
+                    leaderboard_table,
+                    queue=True,
+                )
+
+        with gr.TabItem("🏅 M2 Types", elem_id="llm-benchmark-tab-table", id=4):
+            with gr.Row():
+                with gr.Column():
+                    with gr.Row():
+                        search_bar = gr.Textbox(
+                            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+                            show_label=False,
+                            elem_id="search-bar",
+                        )
+                    with gr.Row():
+                        shown_columns = gr.CheckboxGroup(
+                            choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden],
                             value=[
                                 c.name
                                 for c in fields(AutoEvalColumn)
@@ -163,12 +264,12 @@
                             elem_id="column-select",
                             interactive=True,
                         )
-                    with gr.Row():
-                        deleted_models_visibility = gr.Checkbox(
-                            value=False, label="Show gated/private/deleted models", interactive=True
-                        )
+                    # with gr.Row():
+                    #     deleted_models_visibility = gr.Checkbox(
+                    #         value=False, label="Show gated/private/deleted models", interactive=True
+                    #     )
                 with gr.Column(min_width=320):
-                    #with gr.Box(elem_id="box-filter"):
+                    # with gr.Box(elem_id="box-filter"):
                     filter_columns_type = gr.CheckboxGroup(
                         label="Model types",
                         choices=[t.to_str() for t in ModelType],
@@ -176,26 +277,23 @@
                        interactive=True,
                        elem_id="filter-columns-type",
                    )
-                    filter_columns_precision = gr.CheckboxGroup(
-                        label="Precision",
-                        choices=[i.value.name for i in Precision],
-                        value=[i.value.name for i in Precision],
-                        interactive=True,
-                        elem_id="filter-columns-precision",
-                    )
-                    filter_columns_size = gr.CheckboxGroup(
-                        label="Model sizes (in billions of parameters)",
-                        choices=list(NUMERIC_INTERVALS.keys()),
-                        value=list(NUMERIC_INTERVALS.keys()),
-                        interactive=True,
-                        elem_id="filter-columns-size",
-                    )
+                    # filter_columns_precision = gr.CheckboxGroup(
+                    #     label="Precision",
+                    #     choices=[i.value.name for i in Precision],
+                    #     value=[i.value.name for i in Precision],
+                    #     interactive=True,
+                    #     elem_id="filter-columns-precision",
+                    # )
+                    # filter_columns_size = gr.CheckboxGroup(
+                    #     label="Model sizes (in billions of parameters)",
+                    #     choices=list(NUMERIC_INTERVALS.keys()),
+                    #     value=list(NUMERIC_INTERVALS.keys()),
+                    #     interactive=True,
+                    #     elem_id="filter-columns-size",
+                    # )
 
             leaderboard_table = gr.components.Dataframe(
-                value=leaderboard_df[
-                    [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
-                    + shown_columns.value
-                ],
+                value=leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
                datatype=TYPES,
                elem_id="leaderboard-table",
@@ -215,25 +313,25 @@
                [
                    hidden_leaderboard_table_for_search,
                    shown_columns,
-                    filter_columns_type,
-                    filter_columns_precision,
-                    filter_columns_size,
-                    deleted_models_visibility,
                    search_bar,
+                    filter_columns_type,
                ],
                leaderboard_table,
            )
-            for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
+            for selector in [
+                shown_columns,
+                filter_columns_type,
+                # filter_columns_precision,
+                # filter_columns_size,
+                # deleted_models_visibility,
+            ]:
                selector.change(
                    update_table,
                    [
                        hidden_leaderboard_table_for_search,
                        shown_columns,
-                        filter_columns_type,
-                        filter_columns_precision,
-                        filter_columns_size,
-                        deleted_models_visibility,
                        search_bar,
+                        filter_columns_type,
                    ],
                    leaderboard_table,
                    queue=True,
@@ -342,4 +440,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch()
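
With the reworked update_table signature above, the type/precision/size filters become optional keyword arguments, so an event handler can pass only the hidden dataframe, the column selection, and the search text plus whichever filter widgets a tab actually has. A minimal, self-contained sketch of that calling pattern (illustrative only: plain pandas with made-up column names, not this Space's modules):

import pandas as pd

def update_table(hidden_df, columns, query, type_query=None):
    # The filter defaults to None, so callers may supply only the search text and columns.
    df = hidden_df
    if type_query is not None:
        df = df[df["type"].isin(type_query)]
    if query:
        df = df[df["model"].str.contains(query, case=False)]
    return df[[c for c in columns if c in df.columns]]

demo_df = pd.DataFrame(
    {"model": ["org/ner-a", "org/ner-b"], "type": ["fine-tuned", "zero-shot"], "NCBI": [0.81, 0.62]}
)
print(update_table(demo_df, ["model", "NCBI"], "ner"))                         # no filter passed
print(update_table(demo_df, ["model", "NCBI"], "", type_query=["zero-shot"]))  # type filter only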
src/about.py CHANGED
@@ -1,6 +1,7 @@
 from dataclasses import dataclass
 from enum import Enum
 
+
 @dataclass
 class Task:
     benchmark: str
@@ -11,17 +12,23 @@ class Task:
 # Select your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
-
-NUM_FEWSHOT = 0 # Change with your few shot
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    # task0 = Task("anli_r1", "acc", "ANLI")
+    # task1 = Task("logiqa", "acc_norm", "LogiQA")
+    task0 = Task("ncbi", "f1", "NCBI")
+    task1 = Task("bc5cdr", "f1", "BC5CD")
+    task3 = Task("chia", "f1", "CHIA")
+    task4 = Task("biored", "f1", "BIORED")
+    # task5 = Task("", "f1", "")
+    # task6 = Task("", "f1", "")
+
+
+NUM_FEWSHOT = 0  # Change with your few shot
 # ---------------------------------------------------
 
 
-
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">BioMed NER Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
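
The Tasks enum now enumerates biomedical NER datasets, each scored with F1. A self-contained sketch of how such an enum can drive the per-dataset columns and an overall average (the Task field names and the scores below are assumptions for illustration, not values from this repository):

from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str  # key used in the result files
    metric: str     # metric key, here always "f1"
    col_name: str   # column header shown on the leaderboard


class Tasks(Enum):
    task0 = Task("ncbi", "f1", "NCBI")
    task1 = Task("bc5cdr", "f1", "BC5CD")
    task3 = Task("chia", "f1", "CHIA")
    task4 = Task("biored", "f1", "BIORED")


scores = {"ncbi": 0.85, "bc5cdr": 0.80, "chia": 0.70, "biored": 0.75}  # dummy values
columns = {t.value.col_name: scores[t.value.benchmark] for t in Tasks}
columns["Average"] = sum(columns.values()) / len(Tasks)
print(columns)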
src/display/utils.py CHANGED
@@ -5,6 +5,7 @@ import pandas as pd
 
 from src.about import Tasks
 
+
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
@@ -20,29 +21,33 @@ class ColumnContent:
     hidden: bool = False
     never_hidden: bool = False
 
+
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-#Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+# Scores
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True)])
 for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False)])
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
 auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False, True)])
 auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
 auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False, True)])
+auto_eval_column_dict.append(
+    ["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False, True)]
+)
+auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, True)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
+
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn: # Queue column
@@ -53,19 +58,22 @@ class EvalQueueColumn: # Queue column
     weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)
 
+
 ## All the model information that we might need
 @dataclass
 class ModelDetails:
     name: str
     display_name: str = ""
-    symbol: str = "" # emoji
+    symbol: str = ""  # emoji
 
 
 class ModelType(Enum):
-    PT = ModelDetails(name="pretrained", symbol="🟢")
-    FT = ModelDetails(name="fine-tuned", symbol="🔶")
-    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-    RL = ModelDetails(name="RL-tuned", symbol="🟦")
+    ZEROSHOT = ModelDetails(name="zero-shot", symbol="")
+    FINETUNED = ModelDetails(name="fine-tuned", symbol="")
+    # PT = ModelDetails(name="pretrained", symbol="🟢")
+    # FT = ModelDetails(name="fine-tuned", symbol="🔶")
+    # IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
+    # RL = ModelDetails(name="RL-tuned", symbol="🟦")
     Unknown = ModelDetails(name="", symbol="?")
 
     def to_str(self, separator=" "):
@@ -73,28 +81,34 @@ class ModelType(Enum):
 
     @staticmethod
     def from_str(type):
-        if "fine-tuned" in type or "🔶" in type:
-            return ModelType.FT
-        if "pretrained" in type or "🟢" in type:
-            return ModelType.PT
-        if "RL-tuned" in type or "🟦" in type:
-            return ModelType.RL
-        if "instruction-tuned" in type or "⭕" in type:
-            return ModelType.IFT
+        if "zero-shot" in type or "" in type:
+            return ModelType.ZEROSHOT
+        if "fine-tuned" in type or "" in type:
+            return ModelType.FINETUNED
+        # if "fine-tuned" in type or "🔶" in type:
+        #     return ModelType.FT
+        # if "pretrained" in type or "🟢" in type:
+        #     return ModelType.PT
+        # if "RL-tuned" in type or "🟦" in type:
+        #     return ModelType.RL
+        # if "instruction-tuned" in type or "⭕" in type:
+        #     return ModelType.IFT
         return ModelType.Unknown
 
+
 class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
     Delta = ModelDetails("Delta")
 
+
 class Precision(Enum):
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
     float32 = ModelDetails("float32")
-    #qt_8bit = ModelDetails("8bit")
-    #qt_4bit = ModelDetails("4bit")
-    #qt_GPTQ = ModelDetails("GPTQ")
+    # qt_8bit = ModelDetails("8bit")
+    # qt_4bit = ModelDetails("4bit")
+    # qt_GPTQ = ModelDetails("GPTQ")
     Unknown = ModelDetails("?")
 
     def from_str(precision):
@@ -104,14 +118,15 @@ class Precision(Enum):
             return Precision.bfloat16
         if precision in ["float32"]:
             return Precision.float32
-        #if precision in ["8bit"]:
+        # if precision in ["8bit"]:
         #     return Precision.qt_8bit
-        #if precision in ["4bit"]:
+        # if precision in ["4bit"]:
         #     return Precision.qt_4bit
-        #if precision in ["GPTQ", "None"]:
+        # if precision in ["GPTQ", "None"]:
         #     return Precision.qt_GPTQ
         return Precision.Unknown
 
+
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
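
One thing worth flagging in the new ModelType.from_str: the ZEROSHOT and FINETUNED symbols are empty strings, and "" is a substring of every string, so the committed check `"zero-shot" in type or "" in type` matches any input and nothing can ever reach FINETUNED or Unknown. A standalone sketch with the empty-symbol fallback dropped (the to_str body is assumed from the upstream template):

from dataclasses import dataclass
from enum import Enum


@dataclass
class ModelDetails:
    name: str
    display_name: str = ""
    symbol: str = ""  # emoji


class ModelType(Enum):
    ZEROSHOT = ModelDetails(name="zero-shot", symbol="")
    FINETUNED = ModelDetails(name="fine-tuned", symbol="")
    Unknown = ModelDetails(name="", symbol="?")

    def to_str(self, separator=" "):
        # Assumed template behaviour: "<symbol><separator><name>"
        return f"{self.value.symbol}{separator}{self.value.name}"

    @staticmethod
    def from_str(type_str):
        if "zero-shot" in type_str:
            return ModelType.ZEROSHOT
        if "fine-tuned" in type_str:
            return ModelType.FINETUNED
        return ModelType.Unknown


print(ModelType.from_str("zero-shot"))   # ModelType.ZEROSHOT
print(ModelType.from_str("pretrained"))  # ModelType.Unknown, no empty-string fallback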
src/envs.py CHANGED
@@ -4,17 +4,17 @@ from huggingface_hub import HfApi
 
 # Info to change for your repository
 # ----------------------------------
-TOKEN = os.environ.get("TOKEN") # A read/write token for your org
+TOKEN = os.environ.get("TOKEN")  # A read/write token for your org
 
-OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+OWNER = "m42-health"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 
 REPO_ID = f"{OWNER}/leaderboard"
-QUEUE_REPO = f"{OWNER}/requests"
-RESULTS_REPO = f"{OWNER}/results"
+QUEUE_REPO = f"{OWNER}/ner_leaderboard_requests"
+RESULTS_REPO = f"{OWNER}/ner_leaderboard_results"
 
 # If you setup a cache later, just change HF_HOME
-CACHE_PATH=os.getenv("HF_HOME", ".")
+CACHE_PATH = os.getenv("HF_HOME", ".")
 
 # Local caches
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
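
These constants are what the (currently commented-out) startup sync in app.py consumes when it mirrors the request and result datasets locally. A hedged sketch of that call, assuming the datasets exist under the m42-health org and that TOKEN grants read access:

import os

from huggingface_hub import snapshot_download

OWNER = "m42-health"
QUEUE_REPO = f"{OWNER}/ner_leaderboard_requests"
CACHE_PATH = os.getenv("HF_HOME", ".")
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")

# Mirror the requests dataset into the local eval-queue folder.
snapshot_download(
    repo_id=QUEUE_REPO,
    local_dir=EVAL_REQUESTS_PATH,
    repo_type="dataset",
    etag_timeout=30,
    token=os.environ.get("TOKEN"),
)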
src/leaderboard/read_evals.py CHANGED
@@ -8,28 +8,28 @@ import dateutil
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
+from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType
 from src.submission.check_validity import is_model_on_hub
 
 
 @dataclass
 class EvalResult:
-    """Represents one full evaluation. Built from a combination of the result and request file for a given run.
-    """
-    eval_name: str # org_model_precision (uid)
-    full_model: str # org/model (path on hub)
-    org: str
+    """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
+
+    eval_name: str  # org_model_precision (uid)
+    full_model: str  # org/model (path on hub)
+    org: str
     model: str
-    revision: str # commit hash, "" if main
+    revision: str  # commit hash, "" if main
     results: dict
     precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
-    weight_type: WeightType = WeightType.Original # Original or Adapter
-    architecture: str = "Unknown"
+    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
+    weight_type: WeightType = WeightType.Original  # Original or Adapter
+    architecture: str = "Unknown"
     license: str = "?"
     likes: int = 0
     num_params: int = 0
-    date: str = "" # submission date of request file
+    date: str = ""  # submission date of request file
     still_on_hub: bool = False
 
     @classmethod
@@ -76,7 +76,7 @@ class EvalResult:
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
-            mean_acc = np.mean(accs) * 100.0
+            mean_acc = np.mean(accs)  # * 100.0
             results[task.benchmark] = mean_acc
 
         return self(
@@ -85,10 +85,10 @@ class EvalResult:
            org=org,
            model=model,
            results=results,
-            precision=precision,
-            revision= config.get("model_sha", ""),
+            precision=precision,
+            revision=config.get("model_sha", ""),
            still_on_hub=still_on_hub,
-            architecture=architecture
+            architecture=architecture,
        )
 
    def update_with_request_file(self, requests_path):
@@ -104,8 +104,12 @@ class EvalResult:
            self.likes = request.get("likes", 0)
            self.num_params = request.get("params", 0)
            self.date = request.get("submitted_time", "")
+            # self.precision = request.get("precision", "float32")
        except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
+            print(
+                f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}"
+            )
+            print(f" Args used were - {request_file=}, {requests_path=}, {self.full_model=},")
 
    def to_dict(self):
        """Converts the Eval Result to a dict compatible with our dataframe display"""
@@ -146,10 +150,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
    for tmp_request_file in request_files:
        with open(tmp_request_file, "r") as f:
            req_content = json.load(f)
-            if (
-                req_content["status"] in ["FINISHED"]
-                and req_content["precision"] == precision.split(".")[-1]
-            ):
+            if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]:
                request_file = tmp_request_file
    return request_file
 
@@ -188,7 +189,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
    results = []
    for v in eval_results.values():
        try:
-            v.to_dict() # we test if the dict version is complete
+            v.to_dict()  # we test if the dict version is complete
            results.append(v)
        except KeyError: # not all eval values present
            continue
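
Dropping the `* 100.0` above keeps the aggregated task scores on the same 0-1 scale as the raw F1 values. A standalone sketch of that aggregation step with made-up result-file contents (the nested dict shape is an assumption for illustration):

import numpy as np

# Hypothetical parsed result file: {benchmark: {metric: value}}
raw_results = {
    "ncbi": {"f1": 0.842},
    "bc5cdr": {"f1": 0.791},
    "chia": {"f1": 0.655},
    "biored": {"f1": 0.714},
}

results = {}
for benchmark, metrics in raw_results.items():
    accs = np.array([v for k, v in metrics.items() if k == "f1" and v is not None])
    if accs.size == 0:
        continue
    results[benchmark] = np.mean(accs)  # left on the 0-1 scale, no "* 100.0"

print(results)
print("Average:", round(float(np.mean(list(results.values()))), 3))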