Commit 07db628
Parent(s): ea4c670

Plot Results

- app.py  +6 -0
- src/results.py  +43 -0
app.py  CHANGED

@@ -19,6 +19,7 @@ from src.results import (
     display_results,
     fetch_result_paths,
     load_results_dataframes,
+    plot_results,
     sort_result_paths_per_model,
     update_load_results_component,
     update_tasks_component,
@@ -62,6 +63,7 @@ with gr.Blocks(fill_height=True, fill_width=True, css=".col_heading {width: 50%}
             visible=False,
         )
         hide_std_errors = gr.Checkbox(label="Hide Standard Errors", value=True, info="Options")
+        results_plot = gr.Plot()
         results = gr.HTML()
         results_dataframe_1 = gr.Dataframe(visible=False)
         results_dataframe_2 = gr.Dataframe(visible=False)
@@ -153,6 +155,10 @@ with gr.Blocks(fill_height=True, fill_width=True, css=".col_heading {width: 50%}
         fn=display_results,
         inputs=[results_task, hide_std_errors, show_only_differences, results_dataframe_1, results_dataframe_2],
         outputs=[results, configs],
+    ).then(
+        fn=plot_results,
+        inputs=[results_task, results_dataframe_1, results_dataframe_2],  # results,
+        outputs=results_plot,
     )
     gr.on(
         triggers=[clear_results_btn.click, clear_configs_btn.click],
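The new wiring chains `plot_results` after `display_results` via Gradio's `.then()`, which runs a second handler once the first has finished. A minimal, self-contained sketch of that pattern, with hypothetical stand-in handlers (`make_table` and `make_plot` are illustrative, not from this repo):

```python
import gradio as gr
import pandas as pd


def make_table(df: pd.DataFrame) -> str:
    # First handler: render the dataframe as an HTML table.
    return df.to_html()


def make_plot(df: pd.DataFrame):
    # Second handler: build a matplotlib Figure for gr.Plot.
    ax = df.plot(kind="bar")
    return ax.get_figure()


with gr.Blocks() as demo:
    data = gr.Dataframe(value=pd.DataFrame({"acc": [0.7, 0.8]}))
    html = gr.HTML()
    plot = gr.Plot()
    btn = gr.Button("Render")
    # .then() fires make_plot only after make_table completes,
    # mirroring the display_results -> plot_results chain above.
    btn.click(fn=make_table, inputs=data, outputs=html).then(
        fn=make_plot, inputs=data, outputs=plot
    )

demo.launch()
```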
src/results.py  CHANGED

@@ -4,6 +4,7 @@ import gradio as gr
 import pandas as pd
 
 import src.constants as constants
+from src.constants import TASKS
 from src.hub import glob, load_json_file
 
 
@@ -143,3 +144,45 @@ def clear_results():
 
 def display_loading_message_for_results():
     return ("<h3 style='text-align: center;'>Loading...</h3>",) * 2
+
+
+def plot_results(task, *dfs):
+    df = concat_results(dfs)
+    if df is not None:
+        df = df[
+            [
+                col
+                for col in df.columns
+                if col.startswith("results.")
+                and (col.endswith("acc,none") or col.endswith("acc_norm,none") or col.endswith("exact_match,none"))
+            ]
+        ]
+        if task == "All":
+            df = df[[col for col in df.columns if col.split(".")[1] in TASKS]]
+            # - IFEval: Calculate average of both strict accuracies
+            ifeval_mean = df[
+                [
+                    "results.leaderboard_ifeval.inst_level_strict_acc,none",
+                    "results.leaderboard_ifeval.prompt_level_strict_acc,none",
+                ]
+            ].mean(axis=1)
+            df = df.drop(columns=[col for col in df.columns if col.split(".")[1] == "leaderboard_ifeval"])
+            loc = df.columns.get_loc("results.leaderboard_math_hard.exact_match,none")
+            df.insert(loc - 1, "results.leaderboard_ifeval", ifeval_mean)
+            # Rename
+            df = df.rename(columns=lambda col: TASKS[col.split(".")[1]][0])
+        else:
+            df = df[[col for col in df.columns if col.startswith(f"results.{task}")]]
+            tasks = {key: tupl[0] for key, tupl in TASKS.items()}
+            subtasks = {tupl[1]: tupl[0] for value in constants.SUBTASKS.values() for tupl in value}
+            subtasks = {**tasks, **subtasks}
+            # - IFEval: Return 4 accuracies
+            if task == "leaderboard_ifeval":
+                df = df.rename(columns=lambda col: col.split(".")[2].removesuffix(",none"))
+            else:
+                df = df.rename(columns=lambda col: subtasks[col.split(".")[1]])
+        ax = df.T.rename_axis(columns="Models").plot(kind="bar", ylabel="Scores", rot=45, figsize=(18, 6))
+        fig = ax.get_figure()
+        fig.autofmt_xdate(rotation=45)
+        fig.tight_layout()
+        return fig
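For reference, a minimal, self-contained sketch of the transformation the `"All"` branch performs, on a toy dataframe with made-up scores (the real function gets its frame from `concat_results` and its display names from `TASKS`):

```python
import pandas as pd

# Toy stand-in for the concatenated results dataframe: one row per model,
# columns following the "results.<task>.<metric>,none" naming used above.
# The scores here are invented for illustration only.
df = pd.DataFrame(
    {
        "results.leaderboard_bbh.acc_norm,none": [0.55, 0.61],
        "results.leaderboard_ifeval.inst_level_strict_acc,none": [0.70, 0.74],
        "results.leaderboard_ifeval.prompt_level_strict_acc,none": [0.60, 0.66],
        "results.leaderboard_math_hard.exact_match,none": [0.12, 0.18],
    },
    index=["model-a", "model-b"],
)

# IFEval step: average the two strict accuracies into one column,
# then drop the originals, as in the "All" branch above.
ifeval_mean = df[
    [
        "results.leaderboard_ifeval.inst_level_strict_acc,none",
        "results.leaderboard_ifeval.prompt_level_strict_acc,none",
    ]
].mean(axis=1)
df = df.drop(columns=[c for c in df.columns if c.split(".")[1] == "leaderboard_ifeval"])
df["results.leaderboard_ifeval"] = ifeval_mean

# Transpose so tasks become the x-axis categories and models the legend
# (rename_axis names the columns "Models", which pandas uses as the legend title).
ax = df.T.rename_axis(columns="Models").plot(kind="bar", ylabel="Scores", rot=45, figsize=(18, 6))
fig = ax.get_figure()
fig.tight_layout()
fig.savefig("results.png")  # or return fig to a gr.Plot component
```

Note that `fig.autofmt_xdate(rotation=45)` in the diff, despite its date-oriented name, simply rotates and right-aligns the x tick labels, which is why it also works on the task categories here.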