tathagataraha committed on
Commit d8147b8 · 1 Parent(s): e1cdc4b

[FIX] Filters and search

Files changed (4)
  1. app.py +67 -47
  2. src/display/utils.py +23 -14
  3. src/leaderboard/read_evals.py +17 -10
  4. src/populate.py +1 -1
app.py CHANGED
@@ -106,14 +106,14 @@ def update_df(shown_columns, subset="datasets"):
 def update_table(
     hidden_df: pd.DataFrame,
     columns: list,
-    query: str,
+    query: str = "",
     type_query: list = None,
-    architecture_query: list = None,
+    domain_specific_query: list = None,
     size_query: list = None,
     precision_query: str = None,
     show_deleted: bool = False,
 ):
-    filtered_df = filter_models(hidden_df, type_query, architecture_query, size_query, precision_query, show_deleted)
+    filtered_df = filter_models(hidden_df, type_query, domain_specific_query, size_query, precision_query, show_deleted)
     filtered_df = filter_queries(query, filtered_df)
     df = select_columns(filtered_df, columns, list(hidden_df.columns))
     return df
@@ -157,7 +157,7 @@ def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
 
 
 def filter_models(
-    df: pd.DataFrame, type_query: list, architecture_query: list, size_query: list, precision_query: list, show_deleted: bool
+    df: pd.DataFrame, type_query: list, domain_specific_query: list, size_query: list, precision_query: list, show_deleted: bool
 ) -> pd.DataFrame:
     # Show all models
     # if show_deleted:
@@ -168,13 +168,21 @@ def filter_models(
     filtered_df = df
 
     if type_query is not None:
-        type_emoji = [t[0] for t in type_query]
-        filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
-
-    if architecture_query is not None:
-        arch_types = [t for t in architecture_query]
-        filtered_df = filtered_df.loc[df[AutoEvalColumn.architecture.name].isin(arch_types)]
-        # filtered_df = filtered_df.loc[df[AutoEvalColumn.architecture.name].isin(architecture_query + ["None"])]
+        type_name = [t.split(" ")[1] for t in type_query]
+        filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type.name].isin(type_name)]
+
+    if domain_specific_query is not None:
+        domain_specifics = []
+        if "Yes" in domain_specific_query:
+            domain_specifics.append(True)
+        if "No" in domain_specific_query:
+            domain_specifics.append(False)
+        filtered_df = filtered_df.loc[df[AutoEvalColumn.is_domain_specific.name].isin(domain_specifics)]
+
+    # if architecture_query is not None:
+    #     arch_types = [t for t in architecture_query]
+    #     filtered_df = filtered_df.loc[df[AutoEvalColumn.architecture.name].isin(arch_types)]
+    #     # filtered_df = filtered_df.loc[df[AutoEvalColumn.architecture.name].isin(architecture_query + ["None"])]
 
     if precision_query is not None:
         if AutoEvalColumn.precision.name in df.columns:
@@ -291,6 +299,13 @@ with demo:
                        #     interactive=True,
                        #     elem_id="filter-columns-architecture",
                        # )
+                        filter_domain_specific = gr.CheckboxGroup(
+                            label="Domain specific models",
+                            choices=["Yes", "No"],
+                            value=["Yes", "No"],
+                            interactive=True,
+                            elem_id="filter-columns-type",
+                        )
                        filter_columns_size = gr.CheckboxGroup(
                            label="Model sizes (in billions of parameters)",
                            choices=list(NUMERIC_INTERVALS.keys()),
@@ -311,44 +326,49 @@ with demo:
            )
 
            # Dummy leaderboard for handling the case when the user uses backspace key
-            # hidden_leaderboard_table_for_search = gr.components.Dataframe(
-            #     value=datasets_original_df[DATASET_COLS],
-            #     headers=DATASET_COLS,
-            #     datatype=TYPES,
-            #     visible=False,
-            # )
+            hidden_leaderboard_table_for_search = gr.components.Dataframe(
+                value=datasets_original_df[DATASET_COLS],
+                headers=DATASET_COLS,
+                datatype=TYPES,
+                visible=False,
+            )
 
 
-            # search_bar.submit(
-            #     update_table,
-            #     [
-            #         hidden_leaderboard_table_for_search,
-            #         shown_columns,
-            #         search_bar,
-            #         filter_columns_type,
-            #         # filter_columns_architecture
-            #     ],
-            #     leaderboard_table,
-            # )
-            # for selector in [
-            #     shown_columns,
-            #     filter_columns_type,
-            #     # filter_columns_architecture,
-            #     # filter_columns_size,
-            #     # deleted_models_visibility,
-            # ]:
-            #     selector.change(
-            #         update_table,
-            #         [
-            #             hidden_leaderboard_table_for_search,
-            #             shown_columns,
-            #             search_bar,
-            #             filter_columns_type,
-            #             # filter_columns_architecture,
-            #         ],
-            #         leaderboard_table,
-            #         queue=True,
-            #     )
+            search_bar.submit(
+                update_table,
+                [
+                    hidden_leaderboard_table_for_search,
+                    shown_columns,
+                    search_bar,
+                    filter_columns_type,
+                    filter_domain_specific,
+                    filter_columns_size
+                    # filter_columns_architecture
+                ],
+                leaderboard_table,
+            )
+            for selector in [
+                shown_columns,
+                filter_columns_type,
+                filter_domain_specific,
+                # filter_columns_architecture,
+                filter_columns_size,
+                # deleted_models_visibility,
+            ]:
+                selector.change(
+                    update_table,
+                    [
+                        hidden_leaderboard_table_for_search,
+                        shown_columns,
+                        search_bar,
+                        filter_columns_type,
+                        filter_domain_specific,
+                        filter_columns_size
+                        # filter_columns_architecture,
+                    ],
+                    leaderboard_table,
+                    queue=True,
+                )
 
        with gr.TabItem("🏅 Open Ended Evaluation", elem_id="llm-benchmark-tab-table", id=1):
            pass
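
For reference, a minimal sketch (not the app's own code) of what the new domain-specific filter does: the "Domain specific models" CheckboxGroup hands filter_models the selected labels, which are mapped to booleans and matched against the "Is Domain Specific" column. The toy DataFrame and the standalone helper below are hypothetical; only the Yes/No mapping mirrors the diff.

```python
# Hypothetical standalone version of the Yes/No -> boolean filter added above.
import pandas as pd

df = pd.DataFrame(
    {
        "Model": ["med-llm-7b", "general-llm-7b"],  # made-up rows
        "Is Domain Specific": [True, False],        # mirrors AutoEvalColumn.is_domain_specific
    }
)

def filter_domain_specific(df: pd.DataFrame, domain_specific_query: list) -> pd.DataFrame:
    # The CheckboxGroup returns the selected labels, e.g. ["Yes"], ["No"], or ["Yes", "No"].
    domain_specifics = []
    if "Yes" in domain_specific_query:
        domain_specifics.append(True)
    if "No" in domain_specific_query:
        domain_specifics.append(False)
    return df.loc[df["Is Domain Specific"].isin(domain_specifics)]

print(filter_domain_specific(df, ["Yes"]))        # keeps only the domain-specific row
print(filter_domain_specific(df, ["Yes", "No"]))  # keeps both rows
```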
src/display/utils.py CHANGED
@@ -28,25 +28,25 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
+auto_eval_column_dict = []
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-# Scores
+auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, True)])
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True)])
 for task in HarnessTasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, dataset_task_col=True)])
-# Model information
+auto_eval_column_dict.append(["is_domain_specific", ColumnContent, ColumnContent("Is Domain Specific", "bool", False)])
+auto_eval_column_dict.append(["use_chat_template", ColumnContent, ColumnContent("Uses Chat Template", "bool", False)])
+auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["backbone", ColumnContent, ColumnContent("Base Model", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False, True)])
+auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False)])
+# auto_eval_column_dict.append(["backbone", ColumnContent, ColumnContent("Base Model", "str", False)])
 auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
 auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False, True)])
-auto_eval_column_dict.append(
-    ["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False, True)]
-)
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, True)])
+auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False, True)])
+# auto_eval_column_dict.append(["display_result", ColumnContent, ColumnContent("Display Result", "bool", False, True)])
+auto_eval_column_dict.append(["date", ColumnContent, ColumnContent("Submission Date", "str", False)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
@@ -77,7 +77,7 @@ class ModelType(Enum):
     # FINETUNED = ModelDetails(name="fine-tuned", symbol="⚪")
     PT = ModelDetails(name="pretrained", symbol="🟢")
     # FT = ModelDetails(name="fine-tuned", symbol="🔶")
-    # DS = ModelDetails(name="domain-specific", symbol="➕")
+    # DS = ModelDetails(name="domain-specific", symbol="🏥")
     IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
     RL = ModelDetails(name="preference-tuned", symbol="🟦")
     Unknown = ModelDetails(name="", symbol="?")
@@ -99,7 +99,7 @@ class ModelType(Enum):
             return ModelType.RL
         if "instruction-tuned" in type or "⭕" in type:
             return ModelType.IFT
-        # if "domain-specific" in type or "➕" in type:
+        # if "domain-specific" in type or "🏥" in type:
         #     return ModelType.DS
         return ModelType.Unknown
 
@@ -129,7 +129,16 @@ class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
     Delta = ModelDetails("Delta")
-
+    Unknown = ModelDetails("?")
+
+    def from_str(wt):
+        if "original" in wt.lower():
+            return WeightType.Original
+        if "adapter" in wt.lower():
+            return WeightType.Adapter
+        if "delta" in wt.lower():
+            return WeightType.Delta
+        return WeightType.Unknown
 
 class Precision(Enum):
     auto = ModelDetails("auto")
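
A small sketch of how the new WeightType.from_str lookup behaves, with ModelDetails reduced to a one-field dataclass for the example (the repo's real definition has more fields). Since from_str is a plain function defined on the Enum class, read_evals.py can call it directly as WeightType.from_str(config.get("weight_type", "")).

```python
# Simplified re-creation of the WeightType enum added above, assuming a trimmed-down ModelDetails.
from dataclasses import dataclass
from enum import Enum

@dataclass
class ModelDetails:
    name: str

class WeightType(Enum):
    Adapter = ModelDetails("Adapter")
    Original = ModelDetails("Original")
    Delta = ModelDetails("Delta")
    Unknown = ModelDetails("?")

    def from_str(wt):
        # Called on the class, so `wt` is the raw string from the request file.
        if "original" in wt.lower():
            return WeightType.Original
        if "adapter" in wt.lower():
            return WeightType.Adapter
        if "delta" in wt.lower():
            return WeightType.Delta
        return WeightType.Unknown

assert WeightType.from_str("Original") is WeightType.Original
assert WeightType.from_str("lora-adapter") is WeightType.Adapter
assert WeightType.from_str("") is WeightType.Unknown
```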
src/leaderboard/read_evals.py CHANGED
@@ -22,11 +22,12 @@ class EvalResult:
     model: str
     revision: str  # commit hash, "" if main
     dataset_results: dict
+    is_domain_specific: bool
+    use_chat_template: bool
     # clinical_type_results:dict
     precision: Precision = Precision.Unknown
     model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
     weight_type: WeightType = WeightType.Original  # Original or Adapter
-    architecture: str = "Unknown"
     backbone:str = "Unknown"
     license: str = "?"
     likes: int = 0
@@ -104,17 +105,20 @@ class EvalResult:
             full_model=full_model,
             org=org,
             model=model,
+            revision=config.get("revision", ""),
             dataset_results=dataset_results,
-            # clinical_type_results=types_results,
+            is_domain_specific=config.get("is_domain_specific", False), # Assuming a default value
+            use_chat_template=config.get("use_chat_template", False), # Assuming a default value
             precision=precision,
-            revision=config.get("revision", ""),
-            still_on_hub=still_on_hub,
-            # architecture=model_architecture,
-            backbone=backbone,
             model_type=model_type,
-            num_params=num_params,
+            weight_type=WeightType.from_str(config.get("weight_type", "")), # Assuming the default value
+            backbone=backbone,
             license=license,
-            display_result=display_result
+            likes=config.get("likes", 0), # Assuming a default value
+            num_params=num_params,
+            still_on_hub=still_on_hub,
+            display_result=display_result,
+            date=config.get("submitted_time","")
         )
 
     def update_with_request_file(self, requests_path):
@@ -146,17 +150,20 @@ class EvalResult:
             "eval_name": self.eval_name,  # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
             AutoEvalColumn.model_type.name: self.model_type.value.name,
-            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
+            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol + (" 🏥" if self.is_domain_specific else ""),
             AutoEvalColumn.weight_type.name: self.weight_type.value.name,
             # AutoEvalColumn.architecture.name: self.architecture.value.name,
-            AutoEvalColumn.backbone.name: self.backbone,
+            # AutoEvalColumn.backbone.name: self.backbone,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
+            AutoEvalColumn.is_domain_specific.name: self.is_domain_specific,
+            AutoEvalColumn.use_chat_template.name: self.use_chat_template,
             AutoEvalColumn.revision.name: self.revision,
             AutoEvalColumn.average.name: average,
             AutoEvalColumn.license.name: self.license,
             AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.params.name: self.num_params,
             AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+            AutoEvalColumn.date.name: self.date,
             "display_result" : self.display_result,
         }
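
To make the data flow concrete, here is a hedged, self-contained sketch of how the new request-file fields end up in a leaderboard row. The config dict and the hard-coded 🟢 symbol are invented for illustration; only the key names, the .get() defaults, and the 🏥 suffix logic mirror the diff.

```python
# Hypothetical request/result config; only the key names follow the diff above.
config = {
    "revision": "main",
    "is_domain_specific": True,
    "use_chat_template": False,
    "weight_type": "Original",
    "likes": 12,
    "submitted_time": "2024-05-01T12:00:00Z",
}

is_domain_specific = config.get("is_domain_specific", False)

row = {
    "T": "🟢" + (" 🏥" if is_domain_specific else ""),     # AutoEvalColumn.model_type_symbol
    "Is Domain Specific": is_domain_specific,              # AutoEvalColumn.is_domain_specific
    "Uses Chat Template": config.get("use_chat_template", False),
    "Submission Date": config.get("submitted_time", ""),   # AutoEvalColumn.date
}
print(row)
```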
 
src/populate.py CHANGED
@@ -10,7 +10,7 @@ from src.leaderboard.read_evals import get_raw_eval_results
 
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, evaluation_metric:str, subset:str) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path, requests_path, evaluation_metric)
+    raw_data = get_raw_eval_results(results_path, requests_path, evaluation_metric)
     # print(raw_data)
     # raise Exception("stop")
     all_data_json = [v.to_dict(subset=subset) for v in raw_data]
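
Finally, a sketch of the populate step that consumes these results: each EvalResult-like object is flattened with to_dict() and the rows become the leaderboard DataFrame. FakeResult and the sample rows are stand-ins, not the real get_raw_eval_results() output, and the sort key is an assumption.

```python
# Stand-in objects that expose the same to_dict(subset=...) interface as EvalResult.
import pandas as pd

class FakeResult:
    def __init__(self, row):
        self._row = row

    def to_dict(self, subset="datasets"):
        return self._row

raw_data = [
    FakeResult({"Model": "med-llm-7b", "Average": 71.3, "display_result": True}),
    FakeResult({"Model": "general-llm-7b", "Average": 65.8, "display_result": True}),
]

all_data_json = [v.to_dict(subset="datasets") for v in raw_data]  # same comprehension as populate.py
df = pd.DataFrame.from_records(all_data_json).sort_values(by="Average", ascending=False)
print(df)
```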