Spaces:
Running
Running
Commit
·
c92b14d
1
Parent(s):
7d6aad6
[MODIFY] Metrics for medical summarization, ACI-Bench and SOAP notes
Browse files- app.py +4 -4
- src/about.py +2 -2
- src/display/utils.py +1 -0
- src/leaderboard/read_evals.py +11 -0
- src/populate.py +3 -3
app.py
CHANGED
@@ -704,11 +704,11 @@ with demo:
|
|
704 |
)
|
705 |
with gr.Row():
|
706 |
shown_columns = gr.CheckboxGroup(
|
707 |
-
choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.
|
708 |
value=[
|
709 |
c.name
|
710 |
for c in fields(AutoEvalColumn)
|
711 |
-
if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.
|
712 |
],
|
713 |
label="Select columns to show",
|
714 |
elem_id="column-select",
|
@@ -814,11 +814,11 @@ with demo:
|
|
814 |
)
|
815 |
with gr.Row():
|
816 |
shown_columns = gr.CheckboxGroup(
|
817 |
-
choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.
|
818 |
value=[
|
819 |
c.name
|
820 |
for c in fields(AutoEvalColumn)
|
821 |
-
if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.
|
822 |
],
|
823 |
label="Select columns to show",
|
824 |
elem_id="column-select",
|
|
|
704 |
)
|
705 |
with gr.Row():
|
706 |
shown_columns = gr.CheckboxGroup(
|
707 |
+
choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.aci_col)],
|
708 |
value=[
|
709 |
c.name
|
710 |
for c in fields(AutoEvalColumn)
|
711 |
+
if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.aci_col)
|
712 |
],
|
713 |
label="Select columns to show",
|
714 |
elem_id="column-select",
|
|
|
814 |
)
|
815 |
with gr.Row():
|
816 |
shown_columns = gr.CheckboxGroup(
|
817 |
+
choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.soap_col)],
|
818 |
value=[
|
819 |
c.name
|
820 |
for c in fields(AutoEvalColumn)
|
821 |
+
if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.soap_col)
|
822 |
],
|
823 |
label="Select columns to show",
|
824 |
elem_id="column-select",
|
src/about.py
CHANGED
@@ -79,7 +79,7 @@ class ACIColumns(Enum):
|
|
79 |
aci_column0 = ACIColumn("coverage", "score", "Coverage")
|
80 |
aci_column1 = ACIColumn("conform", "score", "Conformity")
|
81 |
aci_column2 = ACIColumn("fact", "score", "Consistency")
|
82 |
-
aci_column3 = ACIColumn("brief", "score", "Conciseness")
|
83 |
|
84 |
@dataclass
|
85 |
class SOAPColumn:
|
@@ -91,7 +91,7 @@ class SOAPColumns(Enum):
|
|
91 |
soap_column0 = SOAPColumn("coverage", "score", "Coverage")
|
92 |
soap_column1 = SOAPColumn("conform", "score", "Conformity")
|
93 |
soap_column2 = SOAPColumn("fact", "score", "Consistency")
|
94 |
-
soap_column3 = SOAPColumn("brief", "score", "Conciseness")
|
95 |
|
96 |
NUM_FEWSHOT = 0 # Change with your few shot
|
97 |
# ---------------------------------------------------
|
|
|
79 |
aci_column0 = ACIColumn("coverage", "score", "Coverage")
|
80 |
aci_column1 = ACIColumn("conform", "score", "Conformity")
|
81 |
aci_column2 = ACIColumn("fact", "score", "Consistency")
|
82 |
+
# aci_column3 = ACIColumn("brief", "score", "Conciseness")
|
83 |
|
84 |
@dataclass
|
85 |
class SOAPColumn:
|
|
|
91 |
soap_column0 = SOAPColumn("coverage", "score", "Coverage")
|
92 |
soap_column1 = SOAPColumn("conform", "score", "Conformity")
|
93 |
soap_column2 = SOAPColumn("fact", "score", "Consistency")
|
94 |
+
# soap_column3 = SOAPColumn("brief", "score", "Conciseness")
|
95 |
|
96 |
NUM_FEWSHOT = 0 # Change with your few shot
|
97 |
# ---------------------------------------------------
|
src/display/utils.py
CHANGED
@@ -39,6 +39,7 @@ auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent(
|
|
39 |
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
40 |
auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, True)])
|
41 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True, False, dataset_task_col=True, med_safety_col=True, invariant=False)])
|
|
|
42 |
for task in HarnessTasks:
|
43 |
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, dataset_task_col=True, invariant=False)])
|
44 |
for column in OpenEndedColumns:
|
|
|
39 |
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
40 |
auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, True)])
|
41 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True, False, dataset_task_col=True, med_safety_col=True, invariant=False)])
|
42 |
+
auto_eval_column_dict.append(["overall", ColumnContent, ColumnContent("Overall Score", "number", True, False, medical_summarization_col=True, aci_col=True, soap_col=True, invariant=False)])
|
43 |
for task in HarnessTasks:
|
44 |
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, dataset_task_col=True, invariant=False)])
|
45 |
for column in OpenEndedColumns:
|
src/leaderboard/read_evals.py
CHANGED
@@ -272,15 +272,26 @@ class EvalResult:
|
|
272 |
return data_dict
|
273 |
if subset == "medical_summarization":
|
274 |
if len(self.medical_summarization_results) > 0:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
275 |
for task in MedicalSummarizationColumns:
|
276 |
data_dict[task.value.col_name] = self.medical_summarization_results[task.value.benchmark]
|
277 |
return data_dict
|
278 |
if subset == "aci":
|
|
|
|
|
279 |
if len(self.aci_results) > 0:
|
280 |
for task in ACIColumns:
|
281 |
data_dict[task.value.col_name] = self.aci_results[task.value.benchmark]
|
282 |
return data_dict
|
283 |
if subset == "soap":
|
|
|
|
|
284 |
if len(self.soap_results) > 0:
|
285 |
for task in SOAPColumns:
|
286 |
data_dict[task.value.col_name] = self.soap_results[task.value.benchmark]
|
|
|
272 |
return data_dict
|
273 |
if subset == "medical_summarization":
|
274 |
if len(self.medical_summarization_results) > 0:
|
275 |
+
adjusted_conciseness = max(0, self.medical_summarization_results["brief"])
|
276 |
+
coverage = self.medical_summarization_results["coverage"]
|
277 |
+
hm = 2 / (1/coverage + 1/adjusted_conciseness) if not (adjusted_conciseness == 0 or coverage == 0) else 0
|
278 |
+
conformity = self.medical_summarization_results["conform"]
|
279 |
+
consistency = self.medical_summarization_results["fact"]
|
280 |
+
overall = sum([hm, conformity, consistency]) / 3
|
281 |
+
data_dict[AutoEvalColumn.overall.name] = overall
|
282 |
for task in MedicalSummarizationColumns:
|
283 |
data_dict[task.value.col_name] = self.medical_summarization_results[task.value.benchmark]
|
284 |
return data_dict
|
285 |
if subset == "aci":
|
286 |
+
overall = sum([v for v in self.aci_results.values() if v is not None]) / len(ACIColumns)
|
287 |
+
data_dict[AutoEvalColumn.overall.name] = overall
|
288 |
if len(self.aci_results) > 0:
|
289 |
for task in ACIColumns:
|
290 |
data_dict[task.value.col_name] = self.aci_results[task.value.benchmark]
|
291 |
return data_dict
|
292 |
if subset == "soap":
|
293 |
+
overall = sum([v for v in self.soap_results.values() if v is not None]) / len(SOAPColumns)
|
294 |
+
data_dict[AutoEvalColumn.overall.name] = overall
|
295 |
if len(self.soap_results) > 0:
|
296 |
for task in SOAPColumns:
|
297 |
data_dict[task.value.col_name] = self.soap_results[task.value.benchmark]
|
src/populate.py
CHANGED
@@ -25,11 +25,11 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
|
|
25 |
elif subset == "open_ended":
|
26 |
df = df.sort_values(by=["ELO"], ascending=False)
|
27 |
elif subset == "medical_summarization":
|
28 |
-
df = df.sort_values(by=["
|
29 |
elif subset == "aci":
|
30 |
-
df = df.sort_values(by=["
|
31 |
elif subset == "soap":
|
32 |
-
df = df.sort_values(by=["
|
33 |
cols = list(set(df.columns).intersection(set(cols)))
|
34 |
df = df[cols].round(decimals=2)
|
35 |
# filter out if any of the benchmarks have not been produced
|
|
|
25 |
elif subset == "open_ended":
|
26 |
df = df.sort_values(by=["ELO"], ascending=False)
|
27 |
elif subset == "medical_summarization":
|
28 |
+
df = df.sort_values(by=["Overall Score"], ascending=False)
|
29 |
elif subset == "aci":
|
30 |
+
df = df.sort_values(by=["Overall Score"], ascending=False)
|
31 |
elif subset == "soap":
|
32 |
+
df = df.sort_values(by=["Overall Score"], ascending=False)
|
33 |
cols = list(set(df.columns).intersection(set(cols)))
|
34 |
df = df[cols].round(decimals=2)
|
35 |
# filter out if any of the benchmarks have not been produced
|