“WadoodAbdul” committed
Commit f738aa2
1 Parent(s): 3aa629d

added evaluation metric type radio button

app.py CHANGED
@@ -60,12 +60,20 @@ try:
 except Exception:
     restart_space()

+# Span based results
+_, span_based_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "SpanBasedWithPartialOverlap", "datasets")
+span_based_datasets_leaderboard_df = span_based_datasets_original_df.copy()

-raw_data, datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "datasets")
-datasets_leaderboard_df = datasets_original_df.copy()
+_, span_based_types_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, Clinical_TYPES_COLS, TYPES_BENCHMARK_COLS, "SpanBasedWithPartialOverlap", "clinical_types")
+span_based_types_leaderboard_df = span_based_types_original_df.copy()
+
+# Token based results
+_, token_based_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "TokenBasedWithMacroAverage", "datasets")
+token_based_datasets_leaderboard_df = token_based_datasets_original_df.copy()
+
+_, token_based_types_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, Clinical_TYPES_COLS, TYPES_BENCHMARK_COLS, "TokenBasedWithMacroAverage", "clinical_types")
+token_based_types_leaderboard_df = token_based_types_original_df.copy()

-raw_data, types_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, Clinical_TYPES_COLS, TYPES_BENCHMARK_COLS, "clinical_types")
-types_leaderboard_df = types_original_df.copy()

 (
     finished_eval_queue_df,
@@ -74,6 +82,36 @@ types_leaderboard_df = types_original_df.copy()
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)


+def update_df(evaluation_metric, shown_columns, subset="datasets"):
+    print(evaluation_metric)
+
+    if subset == "datasets":
+        match evaluation_metric:
+            case "Span Based":
+                leaderboard_table_df = span_based_datasets_leaderboard_df.copy()
+                hidden_leader_board_df = span_based_datasets_original_df
+            case "Token Based":
+                leaderboard_table_df = token_based_datasets_leaderboard_df.copy()
+                hidden_leader_board_df = token_based_datasets_original_df
+            case _:
+                pass
+    else:
+        match evaluation_metric:
+            case "Span Based":
+                leaderboard_table_df = span_based_types_leaderboard_df.copy()
+                hidden_leader_board_df = span_based_types_original_df
+            case "Token Based":
+                leaderboard_table_df = token_based_types_leaderboard_df.copy()
+                hidden_leader_board_df = token_based_types_original_df
+            case _:
+                pass
+
+
+    value_cols = [c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns
+
+    return leaderboard_table_df[value_cols], hidden_leader_board_df
+
+
 # Searching and filtering
 def update_table(
     hidden_df: pd.DataFrame,
@@ -249,6 +287,12 @@ with demo:
 # )
 with gr.Column(min_width=320):
     # with gr.Box(elem_id="box-filter"):
+
+    eval_metric = gr.Radio(
+        choices=["Span Based", "Token Based"],
+        value = "Span Based",
+        label="Evaluation Metric",
+    )
     filter_columns_type = gr.CheckboxGroup(
         label="Model Types",
         choices=[t.to_str() for t in ModelType],
@@ -270,6 +314,9 @@ with demo:
     # interactive=True,
     # elem_id="filter-columns-size",
     # )
+
+    datasets_leaderboard_df, datasets_original_df = update_df(eval_metric.value, shown_columns.value, subset="datasets")
+
 leaderboard_table = gr.components.Dataframe(
     value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
     headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
@@ -286,6 +333,19 @@ with demo:
     datatype=TYPES,
     visible=False,
 )
+
+eval_metric.change(
+    lambda a, b: update_df(a, b, "datasets"),
+    inputs=[
+        eval_metric,
+        shown_columns,
+    ],
+    outputs=[
+        leaderboard_table,
+        hidden_leaderboard_table_for_search,
+    ]
+)
+
 search_bar.submit(
     update_table,
     [
@@ -317,6 +377,7 @@ with demo:
     queue=True,
 )

+
 with gr.TabItem("🏅 Clinical Types", elem_id="llm-benchmark-tab-table", id=4):
     with gr.Row():
         with gr.Column():
@@ -343,6 +404,12 @@ with demo:
 # value=False, label="Show gated/private/deleted models", interactive=True
 # )
 with gr.Column(min_width=320):
+
+    eval_metric = gr.Radio(
+        choices=["Span Based", "Token Based"],
+        value = "Span Based",
+        label="Evaluation Metric",
+    )
     # with gr.Box(elem_id="box-filter"):
     filter_columns_type = gr.CheckboxGroup(
         label="Model Types",
@@ -372,6 +439,7 @@ with demo:
     # interactive=True,
     # elem_id="filter-columns-size",
     # )
+    types_leaderboard_df, types_original_df = update_df(eval_metric.value, shown_columns.value, subset="clinical_types")

 leaderboard_table = gr.components.Dataframe(
     value=types_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
@@ -389,6 +457,19 @@ with demo:
     datatype=TYPES,
     visible=False,
 )
+
+eval_metric.change(
+    fn=lambda a, b: update_df(a, b, "clinical_types"),
+    inputs=[
+        eval_metric,
+        shown_columns,
+    ],
+    outputs=[
+        leaderboard_table,
+        hidden_leaderboard_table_for_search
+    ]
+)
+
 search_bar.submit(
     update_table,
     [
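In app.py, the commit precomputes span-based and token-based leaderboard dataframes, adds an "Evaluation Metric" radio button to both the datasets and clinical-types tabs, and wires the radio's change event to the new update_df() helper so the visible table and the hidden copy used for search are swapped together. Below is a minimal, self-contained sketch of that Radio → change → Dataframe pattern; the toy dataframes and the pick_df helper are illustrative assumptions, not code from this repository.

import gradio as gr
import pandas as pd

# Two precomputed tables standing in for the span-based and token-based leaderboards.
span_df = pd.DataFrame({"Model": ["A", "B"], "F1": [0.91, 0.87]})
token_df = pd.DataFrame({"Model": ["A", "B"], "F1": [0.88, 0.90]})

def pick_df(metric: str) -> pd.DataFrame:
    # Plays the role of update_df(): return the dataframe matching the radio value.
    return span_df if metric == "Span Based" else token_df

with gr.Blocks() as demo:
    eval_metric = gr.Radio(
        choices=["Span Based", "Token Based"],
        value="Span Based",
        label="Evaluation Metric",
    )
    table = gr.Dataframe(value=pick_df("Span Based"))
    # On change, recompute the table value (the same event wiring app.py adds).
    eval_metric.change(pick_df, inputs=eval_metric, outputs=table)

if __name__ == "__main__":
    demo.launch()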
src/display/utils.py CHANGED
@@ -162,6 +162,9 @@ class PromptTemplateName(Enum):
     LLamaNERTemplate = "llama_70B_ner"
     # MixtralNERTemplate = "mixtral_ner_v0.3"

+class EvaluationMetrics(Enum):
+    SpanBased = "Span Based"
+    TokenBased = "Token Based"


 # Column selection
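The new EvaluationMetrics enum carries the same display strings that the radio buttons and update_df() in app.py match on. As a small, hypothetical illustration (not part of this commit) of how those choices line up with the enum values:

from enum import Enum

class EvaluationMetrics(Enum):
    SpanBased = "Span Based"
    TokenBased = "Token Based"

# The radio choices and default in app.py could be derived from the enum,
# keeping the UI strings and the match/case arms of update_df() in sync.
choices = [m.value for m in EvaluationMetrics]
assert choices == ["Span Based", "Token Based"]
assert EvaluationMetrics.SpanBased.value == "Span Based"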
src/leaderboard/read_evals.py CHANGED
@@ -36,7 +36,7 @@ class EvalResult:
     display_result:bool = True

     @classmethod
-    def init_from_json_file(self, json_filepath):
+    def init_from_json_file(self, json_filepath, evaluation_metric):
         """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)
@@ -82,7 +82,7 @@ class EvalResult:
             task = task.value

             # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["dataset_results"].items() if task.benchmark == k])
+            accs = np.array([v.get(task.metric, None) for k, v in data[evaluation_metric]["dataset_results"].items() if task.benchmark == k])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue

@@ -94,7 +94,7 @@ class EvalResult:
             clinical_type = clinical_type.value

             # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(clinical_type.metric, None) for k, v in data["clinical_type_results"].items() if clinical_type.benchmark == k])
+            accs = np.array([v.get(clinical_type.metric, None) for k, v in data[evaluation_metric]["clinical_type_results"].items() if clinical_type.benchmark == k])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue

@@ -212,7 +212,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
     return request_file


-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
+def get_raw_eval_results(results_path: str, requests_path: str, evaluation_metric: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []

@@ -233,7 +233,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
         # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)
+        eval_result = EvalResult.init_from_json_file(model_result_filepath, evaluation_metric)
         eval_result.update_with_request_file(requests_path)

         # Store results of same eval together
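With this change, init_from_json_file() indexes the result JSON by the requested evaluation metric before reading "dataset_results" and "clinical_type_results". A sketch of the file layout this implies is shown below; only the two top-level metric keys ("SpanBasedWithPartialOverlap", "TokenBasedWithMacroAverage") come from the diff, while the benchmark names and the "f1" field are placeholders.

import numpy as np

# Assumed result-file shape: one top-level entry per evaluation metric.
data = {
    "SpanBasedWithPartialOverlap": {
        "dataset_results": {"ncbi": {"f1": 0.81}},
        "clinical_type_results": {"condition": {"f1": 0.79}},
    },
    "TokenBasedWithMacroAverage": {
        "dataset_results": {"ncbi": {"f1": 0.77}},
        "clinical_type_results": {"condition": {"f1": 0.74}},
    },
}

evaluation_metric = "SpanBasedWithPartialOverlap"
benchmark, metric_name = "ncbi", "f1"  # placeholders for task.benchmark / task.metric

# Same lookup pattern as the updated accs line above.
accs = np.array([
    v.get(metric_name, None)
    for k, v in data[evaluation_metric]["dataset_results"].items()
    if benchmark == k
])
print(accs.mean() if accs.size else None)  # 0.81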
src/populate.py CHANGED
@@ -8,9 +8,9 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results


-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, subset:str) -> pd.DataFrame:
+def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, evaluation_metric:str, subset:str) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path, requests_path)
+    raw_data = get_raw_eval_results(results_path, requests_path, evaluation_metric)
     all_data_json = [v.to_dict(subset=subset) for v in raw_data]

     df = pd.DataFrame.from_records(all_data_json)