albertvillanova HF staff committed on
Commit
07db628
1 Parent(s): ea4c670

Plot Results

Browse files
Files changed (2) hide show
  1. app.py +6 -0
  2. src/results.py +43 -0
app.py CHANGED
@@ -19,6 +19,7 @@ from src.results import (
19
  display_results,
20
  fetch_result_paths,
21
  load_results_dataframes,
 
22
  sort_result_paths_per_model,
23
  update_load_results_component,
24
  update_tasks_component,
@@ -62,6 +63,7 @@ with gr.Blocks(fill_height=True, fill_width=True, css=".col_heading {width: 50%}
62
  visible=False,
63
  )
64
  hide_std_errors = gr.Checkbox(label="Hide Standard Errors", value=True, info="Options")
 
65
  results = gr.HTML()
66
  results_dataframe_1 = gr.Dataframe(visible=False)
67
  results_dataframe_2 = gr.Dataframe(visible=False)
@@ -153,6 +155,10 @@ with gr.Blocks(fill_height=True, fill_width=True, css=".col_heading {width: 50%}
153
  fn=display_results,
154
  inputs=[results_task, hide_std_errors, show_only_differences, results_dataframe_1, results_dataframe_2],
155
  outputs=[results, configs],
 
 
 
 
156
  )
157
  gr.on(
158
  triggers=[clear_results_btn.click, clear_configs_btn.click],
 
19
  display_results,
20
  fetch_result_paths,
21
  load_results_dataframes,
22
+ plot_results,
23
  sort_result_paths_per_model,
24
  update_load_results_component,
25
  update_tasks_component,
 
63
  visible=False,
64
  )
65
  hide_std_errors = gr.Checkbox(label="Hide Standard Errors", value=True, info="Options")
66
+ results_plot = gr.Plot()
67
  results = gr.HTML()
68
  results_dataframe_1 = gr.Dataframe(visible=False)
69
  results_dataframe_2 = gr.Dataframe(visible=False)
 
155
  fn=display_results,
156
  inputs=[results_task, hide_std_errors, show_only_differences, results_dataframe_1, results_dataframe_2],
157
  outputs=[results, configs],
158
+ ).then(
159
+ fn=plot_results,
160
+ inputs=[results_task, results_dataframe_1, results_dataframe_2], # results,
161
+ outputs=results_plot,
162
  )
163
  gr.on(
164
  triggers=[clear_results_btn.click, clear_configs_btn.click],
src/results.py CHANGED
@@ -4,6 +4,7 @@ import gradio as gr
4
  import pandas as pd
5
 
6
  import src.constants as constants
 
7
  from src.hub import glob, load_json_file
8
 
9
 
@@ -143,3 +144,45 @@ def clear_results():
143
 
144
  def display_loading_message_for_results():
145
  return ("<h3 style='text-align: center;'>Loading...</h3>",) * 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  import pandas as pd
5
 
6
  import src.constants as constants
7
+ from src.constants import TASKS
8
  from src.hub import glob, load_json_file
9
 
10
 
 
144
 
145
  def display_loading_message_for_results():
146
  return ("<h3 style='text-align: center;'>Loading...</h3>",) * 2
147
+
148
+
149
def plot_results(task, *dfs):
    """Draw a bar chart of the accuracy-style scores found in the loaded results.

    Args:
        task: ``"All"`` to plot one entry per key of ``TASKS``, or a task key
            that is used as the ``results.<task>`` column-name prefix.
        *dfs: Result frames forwarded as a tuple to ``concat_results``;
            presumably one row per model -- TODO confirm against
            ``concat_results``.

    Returns:
        The figure that owns the bar plot, or ``None`` when
        ``concat_results`` returns ``None``.
    """
    merged = concat_results(dfs)
    if merged is None:
        return None

    # Keep only "results." columns reporting accuracy, normalized accuracy or exact match.
    metric_suffixes = ("acc,none", "acc_norm,none", "exact_match,none")
    scores = merged[
        [name for name in merged.columns if name.startswith("results.") and name.endswith(metric_suffixes)]
    ]

    if task == "All":
        scores = scores[[name for name in scores.columns if name.split(".")[1] in TASKS]]
        # IFEval: collapse its metrics into the mean of the two strict accuracies.
        strict_columns = [
            "results.leaderboard_ifeval.inst_level_strict_acc,none",
            "results.leaderboard_ifeval.prompt_level_strict_acc,none",
        ]
        ifeval_score = scores[strict_columns].mean(axis=1)
        ifeval_columns = [name for name in scores.columns if name.split(".")[1] == "leaderboard_ifeval"]
        scores = scores.drop(columns=ifeval_columns)
        # The averaged column is positioned relative to the MATH exact-match column.
        math_position = scores.columns.get_loc("results.leaderboard_math_hard.exact_match,none")
        scores.insert(math_position - 1, "results.leaderboard_ifeval", ifeval_score)
        # Relabel each column with the first item of its TASKS entry.
        scores = scores.rename(columns=lambda name: TASKS[name.split(".")[1]][0])
    else:
        prefix = f"results.{task}"
        scores = scores[[name for name in scores.columns if name.startswith(prefix)]]
        # Labels keyed by task key, then by subtask key (subtask entries win on clashes).
        labels = {key: entry[0] for key, entry in TASKS.items()}
        labels.update({entry[1]: entry[0] for group in constants.SUBTASKS.values() for entry in group})
        if task == "leaderboard_ifeval":
            # IFEval: keep every accuracy column, labelled by its metric name.
            scores = scores.rename(columns=lambda name: name.split(".")[2].removesuffix(",none"))
        else:
            scores = scores.rename(columns=lambda name: labels[name.split(".")[1]])

    # Transpose so the column labels become the x-axis categories and each row a legend entry.
    axes = scores.T.rename_axis(columns="Models").plot(kind="bar", ylabel="Scores", rot=45, figsize=(18, 6))
    figure = axes.get_figure()
    figure.autofmt_xdate(rotation=45)
    figure.tight_layout()
    return figure