Commit 2b1d96b
Parent(s): 07db628

Plot with plotly

- requirements.txt +2 -0
- src/results.py +13 -10
requirements.txt ADDED

@@ -0,0 +1,2 @@
+plotly
+
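Plotly is added as a dependency because pandas only hands plotting off to it when the package is importable. A minimal sketch of how the backend gets selected, using a throwaway DataFrame that is not part of this repo:

import pandas as pd

# Hypothetical data, only to show backend selection; not part of the commit.
scores = pd.DataFrame({"score": [0.4, 0.7]}, index=["model-a", "model-b"])

# Per-call: ask pandas to build this one figure with the Plotly backend.
fig = scores.plot.bar(backend="plotly")

# Or session-wide: every subsequent .plot call uses Plotly.
pd.options.plotting.backend = "plotly"
fig = scores.plot.bar()

fig.show()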
src/results.py CHANGED

@@ -157,8 +157,11 @@ def plot_results(task, *dfs):
             and (col.endswith("acc,none") or col.endswith("acc_norm,none") or col.endswith("exact_match,none"))
         ]
     ]
+    tasks = {key: tupl[0] for key, tupl in TASKS.items()}
+    tasks["leaderboard_math"] = tasks["leaderboard_math_hard"]
+    subtasks = {tupl[1]: tupl[0] for tupl in constants.SUBTASKS.get(task, [])}
     if task == "All":
-        df = df[[col for col in df.columns if col.split(".")[1] in
+        df = df[[col for col in df.columns if col.split(".")[1] in tasks]]
         # - IFEval: Calculate average of both strict accuracies
         ifeval_mean = df[
             [
@@ -170,19 +173,19 @@ def plot_results(task, *dfs):
         loc = df.columns.get_loc("results.leaderboard_math_hard.exact_match,none")
         df.insert(loc - 1, "results.leaderboard_ifeval", ifeval_mean)
         # Rename
-        df = df.rename(columns=lambda col:
+        df = df.rename(columns=lambda col: tasks[col.split(".")[1]])
     else:
         df = df[[col for col in df.columns if col.startswith(f"results.{task}")]]
-        tasks = {key: tupl[0] for key, tupl in TASKS.items()}
-        subtasks = {tupl[1]: tupl[0] for value in constants.SUBTASKS.values() for tupl in value}
-        subtasks = {**tasks, **subtasks}
         # - IFEval: Return 4 accuracies
         if task == "leaderboard_ifeval":
             df = df.rename(columns=lambda col: col.split(".")[2].removesuffix(",none"))
         else:
-            df = df.rename(columns=lambda col: subtasks
-
-
-
-
+            df = df.rename(columns=lambda col: tasks.get(col.split(".")[1], subtasks.get(col.split(".")[1])))
+    fig = df.T.rename_axis(columns="Model").plot.bar(
+        backend="plotly",
+        barmode="group",
+        labels={"index": "Benchmark" if task == "All" else "Subtask", "value": "Score"},
+        color_discrete_sequence=["#FF9D00", "#32343D"],
+    )
+    fig.update_yaxes(range=[0, 1])
     return fig
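For context on the new plotting call: the DataFrame coming out of the renaming steps holds one row per model and one column per benchmark or subtask, so transposing it puts the benchmarks on the x-axis and the models in the legend, and the Plotly backend then renders grouped bars. A standalone sketch with made-up scores (column names and values are illustrative, not taken from the repo):

import pandas as pd

# Made-up scores: one row per model, one column per benchmark.
df = pd.DataFrame(
    {"BBH": [0.55, 0.61], "GPQA": [0.30, 0.28], "IFEval": [0.72, 0.80]},
    index=["model-a", "model-b"],
)

# Transpose so benchmarks become the index (x-axis) and models become columns (legend),
# then draw grouped bars with the Plotly backend, mirroring the pattern in the commit.
fig = df.T.rename_axis(columns="Model").plot.bar(
    backend="plotly",
    barmode="group",
    labels={"index": "Benchmark", "value": "Score"},
    color_discrete_sequence=["#FF9D00", "#32343D"],
)
fig.update_yaxes(range=[0, 1])  # scores are fractions, so pin the y-axis to [0, 1]
fig.show()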