tathagataraha committed on
Commit 0a14325 · 1 Parent(s): 0da5ee3

[ADD] Med Safety

Files changed (5):
  1. app.py +121 -3
  2. src/about.py +19 -0
  3. src/display/utils.py +20 -7
  4. src/leaderboard/read_evals.py +23 -7
  5. src/populate.py +9 -2
app.py CHANGED
@@ -19,11 +19,14 @@ from src.about import (
     LOGO
 )
 from src.display.css_html_js import custom_css
+# changes to be made here
 from src.display.utils import (
     DATASET_BENCHMARK_COLS,
     OPEN_ENDED_BENCHMARK_COLS,
+    MED_SAFETY_BENCHMARK_COLS,
     DATASET_COLS,
     OPEN_ENDED_COLS,
+    MED_SAFETY_COLS,
     EVAL_COLS,
     EVAL_TYPES,
     NUMERIC_INTERVALS,
@@ -61,12 +64,17 @@ except Exception:
     restart_space()

 # Span based results
+# changes to be made here
+
 _, harness_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "accuracy", "datasets")
 harness_datasets_leaderboard_df = harness_datasets_original_df.copy()

 _, open_ended_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OPEN_ENDED_COLS, OPEN_ENDED_BENCHMARK_COLS, "score", "open_ended")
 open_ended_leaderboard_df = open_ended_original_df.copy()

+_, med_safety_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MED_SAFETY_COLS, MED_SAFETY_BENCHMARK_COLS, "score", "med_safety")
+med_safety_leaderboard_df = med_safety_original_df.copy()
+
 # breakpoint()
 # # Token based results
 # _, token_based_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "TokenBasedWithMacroAverage", "datasets")
@@ -84,12 +92,16 @@ open_ended_leaderboard_df = open_ended_original_df.copy()


 def update_df(shown_columns, subset="datasets"):
+    # changes to be made here
     if subset == "datasets":
         leaderboard_table_df = harness_datasets_leaderboard_df.copy()
         hidden_leader_board_df = harness_datasets_original_df
     elif subset == "open_ended":
         leaderboard_table_df = open_ended_leaderboard_df.copy()
         hidden_leader_board_df = open_ended_original_df
+    elif subset == "med_safety":
+        leaderboard_table_df = med_safety_leaderboard_df.copy()
+        hidden_leader_board_df = med_safety_original_df
     # else:
     #     match evaluation_metric:
     #         case "Span Based":
@@ -432,10 +444,116 @@ with demo:
                     leaderboard_table,
                     queue=True,
                 )
-
         with gr.TabItem("🏅 Med Safety", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown("# Coming Soon!!!", elem_classes="markdown-text")
-            pass
+            with gr.Row():
+                with gr.Column():
+                    with gr.Row():
+                        search_bar = gr.Textbox(
+                            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+                            show_label=False,
+                            elem_id="search-bar",
+                        )
+                    with gr.Row():
+                        shown_columns = gr.CheckboxGroup(
+                            choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.med_safety_col)],
+                            value=[
+                                c.name
+                                for c in fields(AutoEvalColumn)
+                                if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.med_safety_col)
+                            ],
+                            label="Select columns to show",
+                            elem_id="column-select",
+                            interactive=True,
+                        )
+                    # with gr.Row():
+                    #     deleted_models_visibility = gr.Checkbox(
+                    #         value=False, label="Show gated/private/deleted models", interactive=True
+                    #     )
+                with gr.Column(min_width=320):
+                    # with gr.Box(elem_id="box-filter"):
+                    filter_columns_type = gr.CheckboxGroup(
+                        label="Model Types",
+                        choices=[t.to_str() for t in ModelType],
+                        value=[t.to_str() for t in ModelType],
+                        interactive=True,
+                        elem_id="filter-columns-type",
+                    )
+                    # filter_columns_architecture = gr.CheckboxGroup(
+                    #     label="Architecture Types",
+                    #     choices=[i.value.name for i in ModelArch],
+                    #     value=[i.value.name for i in ModelArch],
+                    #     interactive=True,
+                    #     elem_id="filter-columns-architecture",
+                    # )
+                    filter_domain_specific = gr.CheckboxGroup(
+                        label="Domain specific models",
+                        choices=["Yes", "No"],
+                        value=["Yes", "No"],
+                        interactive=True,
+                        elem_id="filter-columns-type",
+                    )
+                    filter_columns_size = gr.CheckboxGroup(
+                        label="Model sizes (in billions of parameters)",
+                        choices=list(NUMERIC_INTERVALS.keys()),
+                        value=list(NUMERIC_INTERVALS.keys()),
+                        interactive=True,
+                        elem_id="filter-columns-size",
+                    )
+
+            datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="med_safety")
+
+            leaderboard_table = gr.components.Dataframe(
+                value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
+                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
+                datatype=TYPES,
+                elem_id="leaderboard-table",
+                interactive=False,
+                visible=True,
+            )
+
+            # Dummy leaderboard for handling the case when the user uses backspace key
+            hidden_leaderboard_table_for_search = gr.components.Dataframe(
+                value=datasets_original_df[MED_SAFETY_COLS],
+                headers=MED_SAFETY_COLS,
+                datatype=TYPES,
+                visible=False,
+            )
+
+
+            search_bar.submit(
+                update_table,
+                [
+                    hidden_leaderboard_table_for_search,
+                    shown_columns,
+                    search_bar,
+                    filter_columns_type,
+                    filter_domain_specific,
+                    filter_columns_size
+                    # filter_columns_architecture
+                ],
+                leaderboard_table,
+            )
+            for selector in [
+                shown_columns,
+                filter_columns_type,
+                filter_domain_specific,
+                filter_columns_size,
+                # deleted_models_visibility,
+            ]:
+                selector.change(
+                    update_table,
+                    [
+                        hidden_leaderboard_table_for_search,
+                        shown_columns,
+                        search_bar,
+                        filter_columns_type,
+                        filter_domain_specific,
+                        filter_columns_size
+                    ],
+                    leaderboard_table,
+                    queue=True,
+                )
+
         with gr.TabItem("🏅 Cross Examination", elem_id="llm-benchmark-tab-table", id=3):
             gr.Markdown("# Coming Soon!!!", elem_classes="markdown-text")
             pass
src/about.py CHANGED
@@ -37,6 +37,25 @@ class OpenEndedColumns(Enum):
     column0 = OpenEndedColumn("ELO", "score", "ELO")
     column1 = OpenEndedColumn("Score", "score", "Score")

+# changes to be made here
+
+@dataclass
+class MedSafetyColumn:
+    benchmark: str
+    metric: str
+    col_name: str
+
+class MedSafetyColumns(Enum):
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    med_safety_column0 = MedSafetyColumn("Competence, Compassion, and Respect for Human Dignity", "score", "Competence, Compassion, and Respect for Human Dignity")
+    med_safety_column1 = MedSafetyColumn("Patient Rights and Confidentiality", "score", "Patient Rights and Confidentiality")
+    med_safety_column2 = MedSafetyColumn("Continued Study and Information Sharing", "score", "Continued Study and Information Sharing")
+    med_safety_column3 = MedSafetyColumn("Medical Care for All", "score", "Medical Care for All")
+    med_safety_column4 = MedSafetyColumn("Community and Public Health", "score", "Community and Public Health")
+    med_safety_column5 = MedSafetyColumn("Physician's Freedom of Choice", "score", "Physician's Freedom of Choice")
+    med_safety_column6 = MedSafetyColumn("Professionalism and Honesty", "score", "Professionalism and Honesty")
+    med_safety_column7 = MedSafetyColumn("Responsibility to Patient", "score", "Responsibility to Patient")
+    med_safety_column8 = MedSafetyColumn("Law and Responsibility to Society", "score", "Law and Responsibility to Society")

 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
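For orientation, a minimal sketch of how a `MedSafetyColumns` member is consumed downstream (it mirrors the loops this commit adds in `src/display/utils.py` and `src/leaderboard/read_evals.py`; the print is illustrative only):

from src.about import MedSafetyColumns

for column in MedSafetyColumns:
    task = column.value  # a MedSafetyColumn instance
    # benchmark: task key in the results JSON, metric: "score",
    # col_name: header shown on the leaderboard
    print(task.benchmark, task.metric, task.col_name)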
src/display/utils.py CHANGED
@@ -3,7 +3,8 @@ from enum import Enum

 import pandas as pd

-from src.about import HarnessTasks, OpenEndedColumns
+# changes to be made here
+from src.about import HarnessTasks, OpenEndedColumns, MedSafetyColumns


 def fields(raw_class):
@@ -15,6 +16,7 @@ def fields(raw_class):
 # when a modif is needed
 @dataclass
 class ColumnContent:
+    # changes to be made here
     name: str
     type: str
     displayed_by_default: bool
@@ -34,11 +36,14 @@ auto_eval_column_dict = []
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, True)])
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True, False, dataset_task_col=True, invariant=False)])
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True, False, dataset_task_col=True, med_safety_col=True, invariant=False)])
 for task in HarnessTasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, dataset_task_col=True, invariant=False)])
 for column in OpenEndedColumns:
     auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, open_ended_col=True, invariant=False)])
+# changes to be made here
+for column in MedSafetyColumns:
+    auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", False, False, med_safety_col=True, invariant=False)])
 auto_eval_column_dict.append(["is_domain_specific", ColumnContent, ColumnContent("Is Domain Specific", "bool", False)])
 auto_eval_column_dict.append(["use_chat_template", ColumnContent, ColumnContent("Uses Chat Template", "bool", False)])
 auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
@@ -57,6 +62,7 @@ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=


 ## For the queue columns in the submission tab
+# changes to be made here
 @dataclass(frozen=True)
 class EvalQueueColumn: # Queue column
     model = ColumnContent("model", "markdown", True)
@@ -67,6 +73,7 @@ class EvalQueueColumn: # Queue column
     weight_type = ColumnContent("weight_type", "str", "Original")
     closed_ended_status = ColumnContent("closed_ended_status", "str", True)
     open_ended_status = ColumnContent("open_ended_status", "str", True)
+    med_safety_status = ColumnContent("med_safety_status", "str", True)

 ## All the model information that we might need
 @dataclass
@@ -185,10 +192,15 @@ class EvaluationMetrics(Enum):


 # Column selection
-DATASET_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.open_ended_col and not c.med_safety_col and not c.cross_examination_col]
-OPEN_ENDED_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.dataset_task_col and not c.med_safety_col and not c.cross_examination_col]
-MED_SAFETY_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.open_ended_col and not c.dataset_task_col and not c.cross_examination_col]
-CROSS_EXAMINATION_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.open_ended_col and not c.med_safety_col and not c.dataset_task_col]
+# changes to be made here
+DATASET_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.dataset_task_col or c.invariant)]
+OPEN_ENDED_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.open_ended_col or c.invariant)]
+MED_SAFETY_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.med_safety_col or c.invariant)]
+CROSS_EXAMINATION_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.cross_examination_col or c.invariant)]
+# DATASET_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.open_ended_col and not c.med_safety_col and not c.cross_examination_col]
+# OPEN_ENDED_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.dataset_task_col and not c.med_safety_col and not c.cross_examination_col]
+# MED_SAFETY_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.open_ended_col and not c.dataset_task_col and not c.cross_examination_col]
+# CROSS_EXAMINATION_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.open_ended_col and not c.med_safety_col and not c.dataset_task_col]

 TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
 COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
@@ -197,9 +209,10 @@ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default a
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

+# changes to be made here
 DATASET_BENCHMARK_COLS = [t.value.col_name for t in HarnessTasks]
 OPEN_ENDED_BENCHMARK_COLS = [t.value.col_name for t in OpenEndedColumns]
-# MED_SAFETY_BENCHMARK_COLS = [t.value.col_name for t in MedSafetyTasks]
+MED_SAFETY_BENCHMARK_COLS = [t.value.col_name for t in MedSafetyColumns]
 # CROSS_EXAMINATION_BENCHMARK_COLS = [t.value.col_name for t in CrossExaminationTasks]

 NUMERIC_INTERVALS = {
src/leaderboard/read_evals.py CHANGED
@@ -8,7 +8,8 @@ import dateutil
 import numpy as np

 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, ModelArch, Precision, HarnessTasks, WeightType, OpenEndedColumns
+# changes to be made here
+from src.display.utils import AutoEvalColumn, ModelType, ModelArch, Precision, HarnessTasks, WeightType, OpenEndedColumns, MedSafetyColumns
 from src.submission.check_validity import is_model_on_hub


@@ -22,6 +23,7 @@ class EvalResult:
     model: str
     revision: str # commit hash, "" if main
     dataset_results: dict
+    # changes to be made here
     open_ended_results: dict
     med_safety_results: dict
     cross_examination_results: dict
@@ -104,7 +106,19 @@ class EvalResult:
                 mean_acc = np.mean(accs) # * 100.0
                 open_ended_results[task.benchmark] = mean_acc
         # breakpoint()
+        # changes to be made here
         med_safety_results = {}
+        if "med-safety" in data["results"]:
+            for task in MedSafetyColumns:
+                task = task.value
+                try:
+                    accs = np.array([v.get(task.metric, None) for k, v in data["results"]["med-safety"].items() if task.benchmark == k])
+                except:
+                    accs = np.array([])
+                if accs.size == 0 or any([acc is None for acc in accs]):
+                    continue
+                mean_acc = np.mean(accs) # * 100.0
+                med_safety_results[task.benchmark] = mean_acc
         cross_examination_results = {}
         # types_results = {}
         # for clinical_type in ClinicalTypes:
@@ -198,12 +212,14 @@ class EvalResult:
             for task in OpenEndedColumns:
                 data_dict[task.value.col_name] = self.open_ended_results[task.value.benchmark]
             return data_dict
-
-        # if subset == "med_safety":
-        #     if len(self.med_safety_results) > 0:
-        #         for task in MedSafetyTasks:
-        #             data_dict[task.value.col_name] = self.med_safety_results[task.value.benchmark]
-        #     return data_dict
+        # changes to be made here
+        if subset == "med_safety":
+            average = sum([v for v in self.med_safety_results.values() if v is not None]) / len(MedSafetyColumns)
+            data_dict[AutoEvalColumn.average.name] = average
+            if len(self.med_safety_results) > 0:
+                for task in MedSafetyColumns:
+                    data_dict[task.value.col_name] = self.med_safety_results[task.value.benchmark]
+            return data_dict

         # if subset == "cross_examination":
         #     if len(self.cross_examination_results) > 0:
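A minimal sketch of the per-model results JSON that the new med-safety parsing assumes (category keys come from `MedSafetyColumns.benchmark` and the metric key is `"score"`, as read in the code above; the values shown are hypothetical):

example_result_file = {
    "results": {
        "med-safety": {
            # one entry per MedSafetyColumn.benchmark; scores are hypothetical
            "Patient Rights and Confidentiality": {"score": 1.42},
            "Medical Care for All": {"score": 2.05},
        }
    }
}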
src/populate.py CHANGED
@@ -4,7 +4,8 @@ import os
 import pandas as pd

 from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumn, EvalQueueColumn, OpenEndedColumns
+# changes to be made here
+from src.display.utils import AutoEvalColumn, EvalQueueColumn, OpenEndedColumns, MedSafetyColumns
 from src.leaderboard.read_evals import get_raw_eval_results


@@ -16,13 +17,15 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     all_data_json = [v.to_dict(subset=subset) for v in raw_data]

     df = pd.DataFrame.from_records(all_data_json)
+    # changes to be made here
     if subset == "datasets":
         df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    elif subset == "med_safety":
+        df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=True)
     elif subset == "open_ended":
         df = df.sort_values(by=["ELO"], ascending=False)
     cols = list(set(df.columns).intersection(set(cols)))
     df = df[cols].round(decimals=2)
-
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
     return raw_data, df
@@ -39,8 +42,10 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
             data = json.load(fp)
             data[EvalQueueColumn.model.name] = make_clickable_model(data["model_name"])
             data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+            # changes to be made here
             data[EvalQueueColumn.closed_ended_status.name] = data["status"]["closed-ended"]
             data[EvalQueueColumn.open_ended_status.name] = data["status"]["open-ended"]
+            data[EvalQueueColumn.med_safety_status.name] = data["status"]["med-safety"]
             all_evals.append(data)
         elif ".md" not in entry:
             # this is a folder
@@ -54,12 +59,14 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
                 data[EvalQueueColumn.revision.name] = data.get("revision", "main")
                 data[EvalQueueColumn.closed_ended_status.name] = data["status"]["closed-ended"]
                 data[EvalQueueColumn.open_ended_status.name] = data["status"]["open-ended"]
+                data[EvalQueueColumn.med_safety_status.name] = data["status"]["med-safety"]
                 all_evals.append(data)
     # breakpoint()
     pending_list = []
     running_list = []
     finished_list = []
     for run in all_evals:
+        # changes to be made here
         status_list = [run["status"]["closed-ended"], run["status"]["open-ended"], run["status"]["med-safety"], run["status"]["cross-examination"]]
         status_list = status_list[:2]
         if "RUNNING" in status_list: