tathagataraha committed on
Commit
c92b14d
·
1 Parent(s): 7d6aad6

[MODIFY] Metrics for medical summarization, ACI-Bench and SOAP notes

Browse files
app.py CHANGED
@@ -704,11 +704,11 @@ with demo:
704
  )
705
  with gr.Row():
706
  shown_columns = gr.CheckboxGroup(
707
- choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)],
708
  value=[
709
  c.name
710
  for c in fields(AutoEvalColumn)
711
- if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)
712
  ],
713
  label="Select columns to show",
714
  elem_id="column-select",
@@ -814,11 +814,11 @@ with demo:
814
  )
815
  with gr.Row():
816
  shown_columns = gr.CheckboxGroup(
817
- choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)],
818
  value=[
819
  c.name
820
  for c in fields(AutoEvalColumn)
821
- if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)
822
  ],
823
  label="Select columns to show",
824
  elem_id="column-select",
 
704
  )
705
  with gr.Row():
706
  shown_columns = gr.CheckboxGroup(
707
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.aci_col)],
708
  value=[
709
  c.name
710
  for c in fields(AutoEvalColumn)
711
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.aci_col)
712
  ],
713
  label="Select columns to show",
714
  elem_id="column-select",
 
814
  )
815
  with gr.Row():
816
  shown_columns = gr.CheckboxGroup(
817
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.soap_col)],
818
  value=[
819
  c.name
820
  for c in fields(AutoEvalColumn)
821
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.soap_col)
822
  ],
823
  label="Select columns to show",
824
  elem_id="column-select",
src/about.py CHANGED
@@ -79,7 +79,7 @@ class ACIColumns(Enum):
79
  aci_column0 = ACIColumn("coverage", "score", "Coverage")
80
  aci_column1 = ACIColumn("conform", "score", "Conformity")
81
  aci_column2 = ACIColumn("fact", "score", "Consistency")
82
- aci_column3 = ACIColumn("brief", "score", "Conciseness")
83
 
84
  @dataclass
85
  class SOAPColumn:
@@ -91,7 +91,7 @@ class SOAPColumns(Enum):
91
  soap_column0 = SOAPColumn("coverage", "score", "Coverage")
92
  soap_column1 = SOAPColumn("conform", "score", "Conformity")
93
  soap_column2 = SOAPColumn("fact", "score", "Consistency")
94
- soap_column3 = SOAPColumn("brief", "score", "Conciseness")
95
 
96
  NUM_FEWSHOT = 0 # Change with your few shot
97
  # ---------------------------------------------------
 
79
  aci_column0 = ACIColumn("coverage", "score", "Coverage")
80
  aci_column1 = ACIColumn("conform", "score", "Conformity")
81
  aci_column2 = ACIColumn("fact", "score", "Consistency")
82
+ # aci_column3 = ACIColumn("brief", "score", "Conciseness")
83
 
84
  @dataclass
85
  class SOAPColumn:
 
91
  soap_column0 = SOAPColumn("coverage", "score", "Coverage")
92
  soap_column1 = SOAPColumn("conform", "score", "Conformity")
93
  soap_column2 = SOAPColumn("fact", "score", "Consistency")
94
+ # soap_column3 = SOAPColumn("brief", "score", "Conciseness")
95
 
96
  NUM_FEWSHOT = 0 # Change with your few shot
97
  # ---------------------------------------------------
src/display/utils.py CHANGED
@@ -39,6 +39,7 @@ auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent(
39
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
40
  auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, True)])
41
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True, False, dataset_task_col=True, med_safety_col=True, invariant=False)])
 
42
  for task in HarnessTasks:
43
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, dataset_task_col=True, invariant=False)])
44
  for column in OpenEndedColumns:
 
39
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
40
  auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, True)])
41
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True, False, dataset_task_col=True, med_safety_col=True, invariant=False)])
42
+ auto_eval_column_dict.append(["overall", ColumnContent, ColumnContent("Overall Score", "number", True, False, medical_summarization_col=True, aci_col=True, soap_col=True, invariant=False)])
43
  for task in HarnessTasks:
44
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, dataset_task_col=True, invariant=False)])
45
  for column in OpenEndedColumns:
src/leaderboard/read_evals.py CHANGED
@@ -272,15 +272,26 @@ class EvalResult:
272
  return data_dict
273
  if subset == "medical_summarization":
274
  if len(self.medical_summarization_results) > 0:
 
 
 
 
 
 
 
275
  for task in MedicalSummarizationColumns:
276
  data_dict[task.value.col_name] = self.medical_summarization_results[task.value.benchmark]
277
  return data_dict
278
  if subset == "aci":
 
 
279
  if len(self.aci_results) > 0:
280
  for task in ACIColumns:
281
  data_dict[task.value.col_name] = self.aci_results[task.value.benchmark]
282
  return data_dict
283
  if subset == "soap":
 
 
284
  if len(self.soap_results) > 0:
285
  for task in SOAPColumns:
286
  data_dict[task.value.col_name] = self.soap_results[task.value.benchmark]
 
272
  return data_dict
273
  if subset == "medical_summarization":
274
  if len(self.medical_summarization_results) > 0:
275
+ adjusted_conciseness = max(0, self.medical_summarization_results["brief"])
276
+ coverage = self.medical_summarization_results["coverage"]
277
+ hm = 2 / (1/coverage + 1/adjusted_conciseness) if not (adjusted_conciseness == 0 or coverage == 0) else 0
278
+ conformity = self.medical_summarization_results["conform"]
279
+ consistency = self.medical_summarization_results["fact"]
280
+ overall = sum([hm, conformity, consistency]) / 3
281
+ data_dict[AutoEvalColumn.overall.name] = overall
282
  for task in MedicalSummarizationColumns:
283
  data_dict[task.value.col_name] = self.medical_summarization_results[task.value.benchmark]
284
  return data_dict
285
  if subset == "aci":
286
+ overall = sum([v for v in self.aci_results.values() if v is not None]) / len(ACIColumns)
287
+ data_dict[AutoEvalColumn.overall.name] = overall
288
  if len(self.aci_results) > 0:
289
  for task in ACIColumns:
290
  data_dict[task.value.col_name] = self.aci_results[task.value.benchmark]
291
  return data_dict
292
  if subset == "soap":
293
+ overall = sum([v for v in self.soap_results.values() if v is not None]) / len(SOAPColumns)
294
+ data_dict[AutoEvalColumn.overall.name] = overall
295
  if len(self.soap_results) > 0:
296
  for task in SOAPColumns:
297
  data_dict[task.value.col_name] = self.soap_results[task.value.benchmark]
src/populate.py CHANGED
@@ -25,11 +25,11 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
25
  elif subset == "open_ended":
26
  df = df.sort_values(by=["ELO"], ascending=False)
27
  elif subset == "medical_summarization":
28
- df = df.sort_values(by=["Coverage"], ascending=False)
29
  elif subset == "aci":
30
- df = df.sort_values(by=["Coverage"], ascending=False)
31
  elif subset == "soap":
32
- df = df.sort_values(by=["Coverage"], ascending=False)
33
  cols = list(set(df.columns).intersection(set(cols)))
34
  df = df[cols].round(decimals=2)
35
  # filter out if any of the benchmarks have not been produced
 
25
  elif subset == "open_ended":
26
  df = df.sort_values(by=["ELO"], ascending=False)
27
  elif subset == "medical_summarization":
28
+ df = df.sort_values(by=["Overall Score"], ascending=False)
29
  elif subset == "aci":
30
+ df = df.sort_values(by=["Overall Score"], ascending=False)
31
  elif subset == "soap":
32
+ df = df.sort_values(by=["Overall Score"], ascending=False)
33
  cols = list(set(df.columns).intersection(set(cols)))
34
  df = df[cols].round(decimals=2)
35
  # filter out if any of the benchmarks have not been produced