agent-leaderboard / visualization.py
Pratik Bhavsar
added more info
b9405c8
raw
history blame
7.15 kB
from utils import get_chart_colors
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objects as go
def setup_matplotlib():
    """Select matplotlib's "Agg" backend and close every open pyplot figure."""
    matplotlib.use(backend="Agg")
    plt.close("all")
def get_performance_chart(df, category_name="Overall"):
    """Draw a horizontal bar chart of every model's "Category Score".

    Args:
        df: DataFrame providing the "Category Score", "Model" and
            "Model Type" columns read below.
        category_name: Text appended to the chart title.

    Returns:
        The matplotlib Figure. ``plt.close`` has already been called on it
        by the time the caller receives it, so pyplot no longer tracks it.
    """
    plt.close("all")
    palette = get_chart_colors()
    score_column = "Category Score"
    ranked = df.sort_values(score_column, ascending=True)
    row_count = len(ranked)

    # The figure grows with the number of rows but is never under 8 in. tall.
    fig_height = max(8, row_count * 0.8)
    fig, ax = plt.subplots(figsize=(16, fig_height))
    plt.rcParams.update({"font.size": 12})
    for surface in (fig.patch, ax):
        surface.set_facecolor(palette["background"])

    try:
        # One bar per row, placed at y = 0, 1, 2, ... in ascending score order.
        bar_positions = np.arange(row_count)
        scores = ranked[score_column]
        bar_colors = [palette[model_type] for model_type in ranked["Model Type"]]
        ax.barh(
            bar_positions,
            scores,
            height=0.4,
            capstyle="round",
            color=bar_colors,
        )

        text_color = palette["text"]
        ax.set_title(
            f"Model Performance - {category_name}",
            pad=20,
            fontsize=20,
            fontweight="bold",
            color=text_color,
        )
        ax.set_xlabel(
            "Average Score (Tool Selection Quality)",
            fontsize=14,
            labelpad=10,
            color=text_color,
        )
        ax.set_xlim(0.0, 1.0)

        ax.set_yticks(bar_positions)
        ax.set_yticklabels(ranked["Model"], fontsize=12, color=text_color)
        plt.subplots_adjust(left=0.35)

        # Print each score just to the right of the end of its bar.
        value_label_style = dict(
            va="center",
            fontsize=12,
            fontweight="bold",
            color=text_color,
        )
        for y_pos, score in enumerate(scores):
            ax.text(score + 0.01, y_pos, f"{score:.3f}", **value_label_style)

        grid_color = palette["grid"]
        ax.grid(True, axis="x", linestyle="--", alpha=0.2, color=grid_color)
        ax.spines[["top", "right"]].set_visible(False)
        ax.spines[["bottom", "left"]].set_color(grid_color)
        ax.tick_params(colors=text_color)

        # Legend swatches are standalone rectangles, one per model type.
        legend_handles = [
            plt.Rectangle((0, 0), 1, 1, facecolor=palette[kind], label=kind)
            for kind in ("Private", "Open source")
        ]
        ax.legend(
            handles=legend_handles,
            title="Model Type",
            loc="lower right",
            fontsize=12,
            title_fontsize=14,
            facecolor=palette["background"],
            labelcolor=text_color,
        )

        plt.tight_layout()
        return fig
    finally:
        # Unregister the figure from pyplot whether or not drawing succeeded.
        plt.close(fig)
def create_radar_plot(df, model_names):
    """Build a plotly radar chart with one filled polygon per requested model.

    The radar axes are the DataFrame columns from position 7 onward, excluding
    "IO Cost". For each name in ``model_names`` the first row whose "Model"
    column equals that name supplies the values.

    Args:
        df: DataFrame with a "Model" column and the axis columns described
            above.
        model_names: Iterable of model names to draw.

    Returns:
        A ``plotly.graph_objects.Figure`` holding one Scatterpolar trace per
        model.
    """
    axes = [name for name in df.columns[7:] if name != "IO Cost"]
    # Repeat the first axis at the end so every polygon closes on itself.
    closed_axes = axes + axes[:1]

    fig = go.Figure()

    # Fill and outline colours are reused cyclically when more than two
    # models are requested.
    fill_palette = ("rgba(99, 102, 241, 0.3)", "rgba(34, 197, 94, 0.3)")
    stroke_palette = ("#4F46E5", "#16A34A")

    for position, model_name in enumerate(model_names):
        matching_rows = df[df["Model"] == model_name]
        record = matching_rows.iloc[0]

        ring = [record[axis] for axis in axes]
        ring.append(ring[0])

        trace = go.Scatterpolar(
            r=ring,
            theta=closed_axes,
            fill="toself",
            fillcolor=fill_palette[position % len(fill_palette)],
            line={
                "color": stroke_palette[position % len(stroke_palette)],
                "width": 2,
            },
            name=model_name,
            text=[f"{val:.3f}" for val in ring],
            textposition="middle right",
            mode="lines+markers+text",
        )
        fig.add_trace(trace)

    polar_layout = {
        "radialaxis": {
            "visible": True,
            "range": [0, 1],
            "showline": False,
            "tickfont": {"size": 12},
        },
        "angularaxis": {
            "tickfont": {"size": 13, "family": "Arial"},
            "rotation": 90,
            "direction": "clockwise",
        },
    }
    legend_layout = {
        "orientation": "h",
        "yanchor": "bottom",
        "y": -0.2,
        "xanchor": "center",
        "x": 0.5,
        "font": {"size": 14},
    }
    title_layout = {
        "text": "Model Comparison",
        "x": 0.5,
        "y": 0.95,
        "font": {"size": 24, "family": "Arial", "color": "#1F2937"},
    }
    fig.update_layout(
        polar=polar_layout,
        showlegend=True,
        legend=legend_layout,
        title=title_layout,
        paper_bgcolor="white",
        plot_bgcolor="white",
        height=700,
        width=900,
        margin={"t": 100, "b": 100, "l": 80, "r": 80},
    )
    return fig
def get_performance_cost_chart(df, category_name="Overall"):
    """Scatter every model's score (as a percentage) against its "IO Cost".

    The x axis is logarithmic and fixed to 0.08-40; the y axis is fixed to
    60-95, so points outside those limits are not visible. Each point is
    annotated with the model name and its cost, and three horizontal bands
    (60-75, 75-85, 85-95) are shaded with the "performance_bands" colours.

    Args:
        df: DataFrame providing the "Category Score", "IO Cost", "Model" and
            "Model Type" columns read below.
        category_name: Text appended to the chart title.

    Returns:
        The matplotlib Figure. As in ``get_performance_chart``, ``plt.close``
        has already been called on it when the caller receives it, so
        repeated calls do not accumulate open figures inside pyplot.
    """
    colors = get_chart_colors()
    fig, ax = plt.subplots(figsize=(12, 8), dpi=300)
    try:
        fig.patch.set_facecolor(colors["background"])
        ax.set_facecolor(colors["background"])
        ax.grid(True, linestyle="--", alpha=0.15, which="both", color=colors["grid"])

        score_column = "Category Score"
        # Identical for every annotation, so build it once outside the loop.
        bbox_props = dict(
            boxstyle="round,pad=0.3", fc=colors["background"], ec="none", alpha=0.8
        )

        for _, row in df.iterrows():
            cost = row["IO Cost"]
            score_pct = row[score_column] * 100
            color = colors[row["Model Type"]]
            # Scores above 0.85 get a slightly larger marker.
            size = 100 if row[score_column] > 0.85 else 80
            edge_color = (
                colors["Private"]
                if row["Model Type"] == "Private"
                else colors["Open source"]
            )
            ax.scatter(
                cost,
                score_pct,
                c=color,
                s=size,
                alpha=0.9,
                edgecolor=edge_color,
                linewidth=1,
                zorder=5,
            )
            ax.annotate(
                f"{row['Model']}\n(${row['IO Cost']:.2f})",
                (cost, score_pct),
                xytext=(5, 5),
                textcoords="offset points",
                fontsize=8,
                color=colors["text"],
                bbox=bbox_props,
                zorder=6,
            )

        ax.set_xscale("log")
        ax.set_xlim(0.08, 40)
        ax.set_ylim(60, 95)
        ax.set_xlabel(
            "I/O Cost per Million Tokens ($)",
            fontsize=10,
            labelpad=10,
            color=colors["text"],
        )
        ax.set_ylabel(
            "Model Performance Score", fontsize=10, labelpad=10, color=colors["text"]
        )

        # Empty scatters serve as legend handles. They are created on `ax`
        # explicitly instead of on whatever axes pyplot considers current.
        legend_elements = [
            ax.scatter([], [], c=colors[label], label=label, s=80)
            for label in ["Private", "Open source"]
        ]
        ax.legend(
            handles=legend_elements,
            loc="upper right",
            frameon=True,
            facecolor=colors["background"],
            edgecolor="none",
            fontsize=9,
            labelcolor=colors["text"],
        )
        ax.set_title(
            f"Performance vs. Cost - {category_name}",
            fontsize=14,
            pad=15,
            fontweight="bold",
            color=colors["text"],
        )

        # Shaded score bands, drawn beneath the points (zorder 1 vs. 5).
        for y1, y2, color in zip(
            [85, 75, 60], [95, 85, 75], colors["performance_bands"]
        ):
            ax.axhspan(y1, y2, alpha=0.2, color=color, zorder=1)

        ax.tick_params(axis="both", which="major", labelsize=9, colors=colors["text"])
        ax.tick_params(axis="both", which="minor", labelsize=8, colors=colors["text"])
        ax.xaxis.set_minor_locator(
            plt.LogLocator(base=10.0, subs=np.arange(2, 10) * 0.1)
        )
        for spine in ax.spines.values():
            spine.set_color(colors["grid"])

        # Lay out this figure specifically, not pyplot's current figure.
        fig.tight_layout()
        return fig
    finally:
        # Unregister the figure from pyplot on success and on failure, so
        # repeated calls do not leak figures.
        plt.close(fig)