Commit 0a14325 · Parent(s): 0da5ee3
[ADD] Med Safety
Files changed:
- app.py +121 -3
- src/about.py +19 -0
- src/display/utils.py +20 -7
- src/leaderboard/read_evals.py +23 -7
- src/populate.py +9 -2

app.py CHANGED

@@ -19,11 +19,14 @@ from src.about import (
     LOGO
 )
 from src.display.css_html_js import custom_css
+# changes to be made here
 from src.display.utils import (
     DATASET_BENCHMARK_COLS,
     OPEN_ENDED_BENCHMARK_COLS,
+    MED_SAFETY_BENCHMARK_COLS,
     DATASET_COLS,
     OPEN_ENDED_COLS,
+    MED_SAFETY_COLS,
     EVAL_COLS,
     EVAL_TYPES,
     NUMERIC_INTERVALS,
@@ -61,12 +64,17 @@ except Exception:
     restart_space()
 
 # Span based results
+# changes to be made here
+
 _, harness_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "accuracy", "datasets")
 harness_datasets_leaderboard_df = harness_datasets_original_df.copy()
 
 _, open_ended_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OPEN_ENDED_COLS, OPEN_ENDED_BENCHMARK_COLS, "score", "open_ended")
 open_ended_leaderboard_df = open_ended_original_df.copy()
 
+_, med_safety_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MED_SAFETY_COLS, MED_SAFETY_BENCHMARK_COLS, "score", "med_safety")
+med_safety_leaderboard_df = med_safety_original_df.copy()
+
 # breakpoint()
 # # Token based results
 # _, token_based_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "TokenBasedWithMacroAverage", "datasets")
@@ -84,12 +92,16 @@ open_ended_leaderboard_df = open_ended_original_df.copy()
 
 
 def update_df(shown_columns, subset="datasets"):
+    # changes to be made here
     if subset == "datasets":
         leaderboard_table_df = harness_datasets_leaderboard_df.copy()
         hidden_leader_board_df = harness_datasets_original_df
     elif subset == "open_ended":
         leaderboard_table_df = open_ended_leaderboard_df.copy()
         hidden_leader_board_df = open_ended_original_df
+    elif subset == "med_safety":
+        leaderboard_table_df = med_safety_leaderboard_df.copy()
+        hidden_leader_board_df = med_safety_original_df
     # else:
     #     match evaluation_metric:
     #         case "Span Based":
@@ -432,10 +444,116 @@ with demo:
                     leaderboard_table,
                     queue=True,
                 )
-
         with gr.TabItem("🏅 Med Safety", elem_id="llm-benchmark-tab-table", id=2):
-            gr.
-
+            with gr.Row():
+                with gr.Column():
+                    with gr.Row():
+                        search_bar = gr.Textbox(
+                            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+                            show_label=False,
+                            elem_id="search-bar",
+                        )
+                    with gr.Row():
+                        shown_columns = gr.CheckboxGroup(
+                            choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.med_safety_col)],
+                            value=[
+                                c.name
+                                for c in fields(AutoEvalColumn)
+                                if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.med_safety_col)
+                            ],
+                            label="Select columns to show",
+                            elem_id="column-select",
+                            interactive=True,
+                        )
+                    # with gr.Row():
+                    #     deleted_models_visibility = gr.Checkbox(
+                    #         value=False, label="Show gated/private/deleted models", interactive=True
+                    #     )
+                with gr.Column(min_width=320):
+                    # with gr.Box(elem_id="box-filter"):
+                    filter_columns_type = gr.CheckboxGroup(
+                        label="Model Types",
+                        choices=[t.to_str() for t in ModelType],
+                        value=[t.to_str() for t in ModelType],
+                        interactive=True,
+                        elem_id="filter-columns-type",
+                    )
+                    # filter_columns_architecture = gr.CheckboxGroup(
+                    #     label="Architecture Types",
+                    #     choices=[i.value.name for i in ModelArch],
+                    #     value=[i.value.name for i in ModelArch],
+                    #     interactive=True,
+                    #     elem_id="filter-columns-architecture",
+                    # )
+                    filter_domain_specific = gr.CheckboxGroup(
+                        label="Domain specific models",
+                        choices=["Yes", "No"],
+                        value=["Yes", "No"],
+                        interactive=True,
+                        elem_id="filter-columns-type",
+                    )
+                    filter_columns_size = gr.CheckboxGroup(
+                        label="Model sizes (in billions of parameters)",
+                        choices=list(NUMERIC_INTERVALS.keys()),
+                        value=list(NUMERIC_INTERVALS.keys()),
+                        interactive=True,
+                        elem_id="filter-columns-size",
+                    )
+
+            datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="med_safety")
+
+            leaderboard_table = gr.components.Dataframe(
+                value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
+                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
+                datatype=TYPES,
+                elem_id="leaderboard-table",
+                interactive=False,
+                visible=True,
+            )
+
+            # Dummy leaderboard for handling the case when the user uses backspace key
+            hidden_leaderboard_table_for_search = gr.components.Dataframe(
+                value=datasets_original_df[MED_SAFETY_COLS],
+                headers=MED_SAFETY_COLS,
+                datatype=TYPES,
+                visible=False,
+            )
+
+
+            search_bar.submit(
+                update_table,
+                [
+                    hidden_leaderboard_table_for_search,
+                    shown_columns,
+                    search_bar,
+                    filter_columns_type,
+                    filter_domain_specific,
+                    filter_columns_size
+                    # filter_columns_architecture
+                ],
+                leaderboard_table,
+            )
+            for selector in [
+                shown_columns,
+                filter_columns_type,
+                filter_domain_specific,
+                filter_columns_size,
+                # deleted_models_visibility,
+            ]:
+                selector.change(
+                    update_table,
+                    [
+                        hidden_leaderboard_table_for_search,
+                        shown_columns,
+                        search_bar,
+                        filter_columns_type,
+                        filter_domain_specific,
+                        filter_columns_size
+                    ],
+                    leaderboard_table,
+                    queue=True,
+                )
+
         with gr.TabItem("🏅 Cross Examination", elem_id="llm-benchmark-tab-table", id=3):
             gr.Markdown("# Coming Soon!!!", elem_classes="markdown-text")
             pass

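The new tab follows the same wiring pattern as the existing ones: a search box and a column selector both re-render the leaderboard Dataframe through one update function. The standalone sketch below reproduces that pattern with invented toy data and column names; the real tab routes everything through update_table, MED_SAFETY_COLS, and the AutoEvalColumn flags instead.

# Minimal sketch of the tab's wiring: search + column picker -> Dataframe.
# All data, column names, and the filter_table helper here are hypothetical.
import gradio as gr
import pandas as pd

FULL_DF = pd.DataFrame(
    {
        "Model": ["org/model-a", "org/model-b", "org/model-c"],
        "Average": [1.8, 2.4, 1.2],
        "Responsibility to Patient": [2.0, 2.6, 1.1],
    }
)

def filter_table(query: str, columns: list[str]) -> pd.DataFrame:
    # Keep rows whose model name matches any ';'-separated query, then project the chosen columns.
    df = FULL_DF
    if query.strip():
        terms = [t.strip() for t in query.split(";") if t.strip()]
        df = df[df["Model"].apply(lambda m: any(t in m for t in terms))]
    return df[["Model"] + [c for c in columns if c != "Model"]]

with gr.Blocks() as demo:
    search_bar = gr.Textbox(show_label=False, placeholder="Search (separate queries with `;`)")
    shown_columns = gr.CheckboxGroup(
        choices=["Average", "Responsibility to Patient"],
        value=["Average"],
        label="Select columns to show",
    )
    table = gr.Dataframe(value=filter_table("", ["Average"]), interactive=False)
    # Both events re-run the same filter, exactly like search_bar.submit / selector.change above.
    search_bar.submit(filter_table, [search_bar, shown_columns], table)
    shown_columns.change(filter_table, [search_bar, shown_columns], table)

if __name__ == "__main__":
    demo.launch()
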
src/about.py CHANGED

@@ -37,6 +37,25 @@ class OpenEndedColumns(Enum):
     column0 = OpenEndedColumn("ELO", "score", "ELO")
     column1 = OpenEndedColumn("Score", "score", "Score")
 
+# changes to be made here
+
+@dataclass
+class MedSafetyColumn:
+    benchmark: str
+    metric: str
+    col_name: str
+
+class MedSafetyColumns(Enum):
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    med_safety_column0 = MedSafetyColumn("Competence, Compassion, and Respect for Human Dignity", "score", "Competence, Compassion, and Respect for Human Dignity")
+    med_safety_column1 = MedSafetyColumn("Patient Rights and Confidentiality", "score", "Patient Rights and Confidentiality")
+    med_safety_column2 = MedSafetyColumn("Continued Study and Information Sharing", "score", "Continued Study and Information Sharing")
+    med_safety_column3 = MedSafetyColumn("Medical Care for All", "score", "Medical Care for All")
+    med_safety_column4 = MedSafetyColumn("Community and Public Health", "score", "Community and Public Health")
+    med_safety_column5 = MedSafetyColumn("Physician's Freedom of Choice", "score", "Physician's Freedom of Choice")
+    med_safety_column6 = MedSafetyColumn("Professionalism and Honesty", "score", "Professionalism and Honesty")
+    med_safety_column7 = MedSafetyColumn("Responsibility to Patient", "score", "Responsibility to Patient")
+    med_safety_column8 = MedSafetyColumn("Law and Responsibility to Society", "score", "Law and Responsibility to Society")
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------

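MedSafetyColumn/MedSafetyColumns mirror the existing OpenEndedColumn/OpenEndedColumns pattern: an Enum whose member values are small dataclasses carrying the results-file key, the metric key, and the display name. The sketch below shows how such an enum is consumed elsewhere in the repo (for example, MED_SAFETY_BENCHMARK_COLS in src/display/utils.py reads .value.col_name); only two of the nine categories are reproduced to keep it short.

# Standalone sketch of the enum-of-dataclasses pattern and how it is iterated.
from dataclasses import dataclass
from enum import Enum

@dataclass
class MedSafetyColumn:
    benchmark: str  # task key in the results json
    metric: str     # metric key in the results json
    col_name: str   # name displayed on the leaderboard

class MedSafetyColumns(Enum):
    med_safety_column0 = MedSafetyColumn("Responsibility to Patient", "score", "Responsibility to Patient")
    med_safety_column1 = MedSafetyColumn("Medical Care for All", "score", "Medical Care for All")

# Same comprehension shape as MED_SAFETY_BENCHMARK_COLS in src/display/utils.py.
benchmark_cols = [c.value.col_name for c in MedSafetyColumns]
print(benchmark_cols)  # ['Responsibility to Patient', 'Medical Care for All']
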
src/display/utils.py CHANGED

@@ -3,7 +3,8 @@ from enum import Enum
 
 import pandas as pd
 
-
+# changes to be made here
+from src.about import HarnessTasks, OpenEndedColumns, MedSafetyColumns
 
 
 def fields(raw_class):
@@ -15,6 +16,7 @@ def fields(raw_class):
 # when a modif is needed
 @dataclass
 class ColumnContent:
+    # changes to be made here
     name: str
     type: str
     displayed_by_default: bool
@@ -34,11 +36,14 @@ auto_eval_column_dict = []
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, True)])
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True, False, dataset_task_col=True, invariant=False)])
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True, False, dataset_task_col=True, med_safety_col=True, invariant=False)])
 for task in HarnessTasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, dataset_task_col=True, invariant=False)])
 for column in OpenEndedColumns:
     auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, open_ended_col=True, invariant=False)])
+# changes to be made here
+for column in MedSafetyColumns:
+    auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", False, False, med_safety_col=True, invariant=False)])
 auto_eval_column_dict.append(["is_domain_specific", ColumnContent, ColumnContent("Is Domain Specific", "bool", False)])
 auto_eval_column_dict.append(["use_chat_template", ColumnContent, ColumnContent("Uses Chat Template", "bool", False)])
 auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
@@ -57,6 +62,7 @@ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=
 
 
 ## For the queue columns in the submission tab
+# changes to be made here
 @dataclass(frozen=True)
 class EvalQueueColumn: # Queue column
     model = ColumnContent("model", "markdown", True)
@@ -67,6 +73,7 @@ class EvalQueueColumn: # Queue column
     weight_type = ColumnContent("weight_type", "str", "Original")
     closed_ended_status = ColumnContent("closed_ended_status", "str", True)
     open_ended_status = ColumnContent("open_ended_status", "str", True)
+    med_safety_status = ColumnContent("med_safety_status", "str", True)
 
 ## All the model information that we might need
 @dataclass
@@ -185,10 +192,15 @@ class EvaluationMetrics(Enum):
 
 
 # Column selection
-
-
-
-
+# changes to be made here
+DATASET_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.dataset_task_col or c.invariant)]
+OPEN_ENDED_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.open_ended_col or c.invariant)]
+MED_SAFETY_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.med_safety_col or c.invariant)]
+CROSS_EXAMINATION_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.cross_examination_col or c.invariant)]
+# DATASET_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.open_ended_col and not c.med_safety_col and not c.cross_examination_col]
+# OPEN_ENDED_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.dataset_task_col and not c.med_safety_col and not c.cross_examination_col]
+# MED_SAFETY_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.open_ended_col and not c.dataset_task_col and not c.cross_examination_col]
+# CROSS_EXAMINATION_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.open_ended_col and not c.med_safety_col and not c.dataset_task_col]
 
 TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
 COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
@@ -197,9 +209,10 @@ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default a
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
+# changes to be made here
 DATASET_BENCHMARK_COLS = [t.value.col_name for t in HarnessTasks]
 OPEN_ENDED_BENCHMARK_COLS = [t.value.col_name for t in OpenEndedColumns]
-
+MED_SAFETY_BENCHMARK_COLS = [t.value.col_name for t in MedSafetyColumns]
 # CROSS_EXAMINATION_BENCHMARK_COLS = [t.value.col_name for t in CrossExaminationTasks]
 
 NUMERIC_INTERVALS = {

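The subset column lists added here all follow one rule: a column is kept for a subset if its subset-specific flag or the shared invariant flag is set, and it is not hidden. The self-contained sketch below illustrates that selection with a simplified column dataclass; the flag names match the ones used above, but the defaults and sample columns are assumptions for illustration only.

# Sketch of flag-based column selection; Col is a stand-in for ColumnContent.
from dataclasses import dataclass

@dataclass
class Col:
    name: str
    hidden: bool = False
    invariant: bool = True        # assumed default: shown in every subset unless overridden
    med_safety_col: bool = False
    dataset_task_col: bool = False

columns = [
    Col("Model"),  # shared across subsets via invariant=True
    Col("Average", invariant=False, med_safety_col=True, dataset_task_col=True),
    Col("Responsibility to Patient", invariant=False, med_safety_col=True),
    Col("MedQA", invariant=False, dataset_task_col=True),
]

# Same shape as the MED_SAFETY_COLS / DATASET_COLS comprehensions above.
MED_SAFETY_COLS = [c.name for c in columns if not c.hidden and (c.med_safety_col or c.invariant)]
DATASET_COLS = [c.name for c in columns if not c.hidden and (c.dataset_task_col or c.invariant)]
print(MED_SAFETY_COLS)  # ['Model', 'Average', 'Responsibility to Patient']
print(DATASET_COLS)     # ['Model', 'Average', 'MedQA']
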
src/leaderboard/read_evals.py CHANGED

@@ -8,7 +8,8 @@ import dateutil
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-
+# changes to be made here
+from src.display.utils import AutoEvalColumn, ModelType, ModelArch, Precision, HarnessTasks, WeightType, OpenEndedColumns, MedSafetyColumns
 from src.submission.check_validity import is_model_on_hub
 
 
@@ -22,6 +23,7 @@ class EvalResult:
     model: str
     revision: str # commit hash, "" if main
     dataset_results: dict
+    # changes to be made here
     open_ended_results: dict
     med_safety_results: dict
     cross_examination_results: dict
@@ -104,7 +106,19 @@
                 mean_acc = np.mean(accs) # * 100.0
                 open_ended_results[task.benchmark] = mean_acc
         # breakpoint()
+        # changes to be made here
         med_safety_results = {}
+        if "med-safety" in data["results"]:
+            for task in MedSafetyColumns:
+                task = task.value
+                try:
+                    accs = np.array([v.get(task.metric, None) for k, v in data["results"]["med-safety"].items() if task.benchmark == k])
+                except:
+                    accs = np.array([])
+                if accs.size == 0 or any([acc is None for acc in accs]):
+                    continue
+                mean_acc = np.mean(accs) # * 100.0
+                med_safety_results[task.benchmark] = mean_acc
         cross_examination_results = {}
         # types_results = {}
         # for clinical_type in ClinicalTypes:
@@ -198,12 +212,14 @@
             for task in OpenEndedColumns:
                 data_dict[task.value.col_name] = self.open_ended_results[task.value.benchmark]
             return data_dict
-
-
-
-
-
-
+        # changes to be made here
+        if subset == "med_safety":
+            average = sum([v for v in self.med_safety_results.values() if v is not None]) / len(MedSafetyColumns)
+            data_dict[AutoEvalColumn.average.name] = average
+            if len(self.med_safety_results) > 0:
+                for task in MedSafetyColumns:
+                    data_dict[task.value.col_name] = self.med_safety_results[task.value.benchmark]
+            return data_dict
 
         # if subset == "cross_examination":
         #     if len(self.cross_examination_results) > 0:

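The new parsing block reads data["results"]["med-safety"] as a mapping from category name to a per-metric dict and averages the matching entries. The snippet below builds a results fragment of the shape that loop appears to expect, with invented scores, and applies the same aggregation; the exact layout of real result files is an assumption inferred from the loop, not confirmed by the repo.

# Hypothetical results fragment plus the same mean-over-matching-category aggregation.
import numpy as np

data = {
    "results": {
        "med-safety": {
            "Responsibility to Patient": {"score": 2.1},
            "Medical Care for All": {"score": 1.7},
        }
    }
}

# Mirrors the new loop: collect the "score" values whose key matches the benchmark, then average.
scores = np.array([
    v.get("score")
    for k, v in data["results"]["med-safety"].items()
    if k == "Responsibility to Patient"
])
print(float(np.mean(scores)))  # 2.1
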
src/populate.py CHANGED

@@ -4,7 +4,8 @@ import os
 import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_clickable_model
-
+# changes to be made here
+from src.display.utils import AutoEvalColumn, EvalQueueColumn, OpenEndedColumns, MedSafetyColumns
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
@@ -16,13 +17,15 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     all_data_json = [v.to_dict(subset=subset) for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
+    # changes to be made here
     if subset == "datasets":
         df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    elif subset == "med_safety":
+        df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=True)
     elif subset == "open_ended":
         df = df.sort_values(by=["ELO"], ascending=False)
     cols = list(set(df.columns).intersection(set(cols)))
     df = df[cols].round(decimals=2)
-
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
     return raw_data, df
@@ -39,8 +42,10 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
                 data = json.load(fp)
             data[EvalQueueColumn.model.name] = make_clickable_model(data["model_name"])
             data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+            # changes to be made here
             data[EvalQueueColumn.closed_ended_status.name] = data["status"]["closed-ended"]
             data[EvalQueueColumn.open_ended_status.name] = data["status"]["open-ended"]
+            data[EvalQueueColumn.med_safety_status.name] = data["status"]["med-safety"]
             all_evals.append(data)
         elif ".md" not in entry:
             # this is a folder
@@ -54,12 +59,14 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
                 data[EvalQueueColumn.revision.name] = data.get("revision", "main")
                 data[EvalQueueColumn.closed_ended_status.name] = data["status"]["closed-ended"]
                 data[EvalQueueColumn.open_ended_status.name] = data["status"]["open-ended"]
+                data[EvalQueueColumn.med_safety_status.name] = data["status"]["med-safety"]
                 all_evals.append(data)
     # breakpoint()
     pending_list = []
     running_list = []
     finished_list = []
     for run in all_evals:
+        # changes to be made here
         status_list = [run["status"]["closed-ended"], run["status"]["open-ended"], run["status"]["med-safety"], run["status"]["cross-examination"]]
         status_list = status_list[:2]
         if "RUNNING" in status_list:

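One behavioural detail worth noting from this hunk: the med_safety subset is sorted by the Average column in ascending order, whereas the datasets subset keeps its descending sort. The toy frame below simply demonstrates the two orderings; the data is made up for illustration.

# Toy demonstration of the two sort directions used in get_leaderboard_df.
import pandas as pd

df = pd.DataFrame({"Model": ["a", "b", "c"], "Average": [2.4, 1.2, 1.8]})

med_safety_view = df.sort_values(by=["Average"], ascending=True)   # ordering used for med_safety
datasets_view = df.sort_values(by=["Average"], ascending=False)    # ordering used for datasets
print(med_safety_view["Model"].tolist())  # ['b', 'c', 'a']
print(datasets_view["Model"].tolist())    # ['a', 'c', 'b']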