djstrong committed on
Commit
ad6c108
·
1 Parent(s): bc4548b

disable psc_g; rag avg

Browse files
src/about.py CHANGED
@@ -28,7 +28,7 @@ class Tasks(Enum):
28
  task12 = Task("polish_ppc_multiple_choice", "acc,none", "ppc_mc", "multiple_choice", 0.419)
29
  task13 = Task("polish_ppc_regex", "exact_match,score-first", "ppc_g", "generate_until", 0.419)
30
  task14 = Task("polish_psc_multiple_choice", "f1,none", "psc_mc", "multiple_choice", 0.466)
31
- task15 = Task("polish_psc_regex", "f1,score-first", "psc_g", "generate_until", 0.466)
32
  task16 = Task("polish_cbd_multiple_choice", "f1,none", "cbd_mc", "multiple_choice", 0.149)
33
  task17 = Task("polish_cbd_regex", "f1,score-first", "cbd_g", "generate_until", 0.149)
34
  task18 = Task("polish_klej_ner_multiple_choice", "acc,none", "klej_ner_mc", "multiple_choice", 0.343)
 
28
  task12 = Task("polish_ppc_multiple_choice", "acc,none", "ppc_mc", "multiple_choice", 0.419)
29
  task13 = Task("polish_ppc_regex", "exact_match,score-first", "ppc_g", "generate_until", 0.419)
30
  task14 = Task("polish_psc_multiple_choice", "f1,none", "psc_mc", "multiple_choice", 0.466)
31
+ # task15 = Task("polish_psc_regex", "f1,score-first", "psc_g", "generate_until", 0.466) # disabled until recalculation
32
  task16 = Task("polish_cbd_multiple_choice", "f1,none", "cbd_mc", "multiple_choice", 0.149)
33
  task17 = Task("polish_cbd_regex", "f1,score-first", "cbd_g", "generate_until", 0.149)
34
  task18 = Task("polish_klej_ner_multiple_choice", "acc,none", "klej_ner_mc", "multiple_choice", 0.343)
src/display/utils.py CHANGED
@@ -34,9 +34,12 @@ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average
34
  auto_eval_column_dict.append(["average_old", ColumnContent, ColumnContent("Average old", "number", False)])
35
  auto_eval_column_dict.append(["average_g", ColumnContent, ColumnContent("Avg g", "number", True)])
36
  auto_eval_column_dict.append(["average_mc", ColumnContent, ColumnContent("Avg mc", "number", True)])
 
37
  for task in Tasks:
38
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
39
  # Model information
 
 
40
  auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
41
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
42
  auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
 
34
  auto_eval_column_dict.append(["average_old", ColumnContent, ColumnContent("Average old", "number", False)])
35
  auto_eval_column_dict.append(["average_g", ColumnContent, ColumnContent("Avg g", "number", True)])
36
  auto_eval_column_dict.append(["average_mc", ColumnContent, ColumnContent("Avg mc", "number", True)])
37
+
38
  for task in Tasks:
39
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
40
  # Model information
41
+ auto_eval_column_dict.append(["average_rag", ColumnContent, ColumnContent("Avg RAG", "number", True)])
42
+
43
  auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
44
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
45
  auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
src/leaderboard/read_evals.py CHANGED
@@ -166,6 +166,7 @@ class EvalResult:
166
  """Converts the Eval Result to a dict compatible with our dataframe display"""
167
  g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
168
  mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
 
169
  all_tasks = g_tasks + mc_tasks
170
  all_tasks_wo_polqa = [task for task in all_tasks if 'polqa' not in task]
171
 
@@ -188,6 +189,7 @@ class EvalResult:
188
  average = sum([(self.results.get(task,0) - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100 for task in all_tasks]) / len(all_tasks)
189
  average_g = sum([(self.results.get(task,0) - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100 for task in g_tasks]) / len(g_tasks)
190
  average_mc = sum([(self.results.get(task,0) - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100 for task in mc_tasks]) / len(mc_tasks)
 
191
 
192
  data_dict = {}
193
  # data_dict = {
@@ -280,6 +282,11 @@ class EvalResult:
280
  except KeyError:
281
  print(f"Could not find average_mc")
282
 
 
 
 
 
 
283
  try:
284
  data_dict[AutoEvalColumn.license.name] = self.license
285
  except KeyError:
 
166
  """Converts the Eval Result to a dict compatible with our dataframe display"""
167
  g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
168
  mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
169
+ rag_tasks = ['polish_polqa_reranking_multiple_choice', 'polish_polqa_open_book']
170
  all_tasks = g_tasks + mc_tasks
171
  all_tasks_wo_polqa = [task for task in all_tasks if 'polqa' not in task]
172
 
 
189
  average = sum([(self.results.get(task,0) - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100 for task in all_tasks]) / len(all_tasks)
190
  average_g = sum([(self.results.get(task,0) - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100 for task in g_tasks]) / len(g_tasks)
191
  average_mc = sum([(self.results.get(task,0) - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100 for task in mc_tasks]) / len(mc_tasks)
192
+ average_rag = sum([(self.results.get(task,0) - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100 for task in rag_tasks]) / len(rag_tasks)
193
 
194
  data_dict = {}
195
  # data_dict = {
 
282
  except KeyError:
283
  print(f"Could not find average_mc")
284
 
285
+ try:
286
+ data_dict[AutoEvalColumn.average_rag.name] = average_rag
287
+ except KeyError:
288
+ print(f"Could not find average_rag")
289
+
290
  try:
291
  data_dict[AutoEvalColumn.license.name] = self.license
292
  except KeyError: