Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Pratik Bhavsar
committed on
Commit
·
19b159e
1
Parent(s):
e540986
improved radar chart and added categories
Browse files
app.py
CHANGED
@@ -5,36 +5,59 @@ import numpy as np
|
|
5 |
import plotly.graph_objects as go
|
6 |
|
7 |
df = pd.read_csv("results.csv").dropna()
|
8 |
-
dataset_columns = df.columns[7:].tolist()
|
9 |
|
10 |
-
|
11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
datasets = df.columns[7:].tolist()
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
)
|
29 |
-
)
|
30 |
|
31 |
fig.update_layout(
|
32 |
polar=dict(
|
33 |
radialaxis=dict(
|
34 |
-
visible=True,
|
35 |
-
range=[0, 1],
|
36 |
-
showline=False,
|
37 |
-
tickfont=dict(size=12),
|
38 |
),
|
39 |
angularaxis=dict(
|
40 |
tickfont=dict(size=13, family="Arial"),
|
@@ -42,9 +65,9 @@ def create_radar_plot(df, model_name):
|
|
42 |
direction="clockwise",
|
43 |
),
|
44 |
),
|
45 |
-
showlegend=
|
46 |
title=dict(
|
47 |
-
text=
|
48 |
x=0.5,
|
49 |
y=0.95,
|
50 |
font=dict(size=24, family="Arial", color="#1F2937"),
|
@@ -57,13 +80,13 @@ def create_radar_plot(df, model_name):
|
|
57 |
|
58 |
return fig
|
59 |
|
60 |
-
def model_info_tab(model_name=None):
|
61 |
-
if model_name is None:
|
62 |
-
model_name = df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]
|
63 |
|
64 |
-
|
65 |
-
|
|
|
66 |
|
|
|
|
|
67 |
info_html = filtered_df[
|
68 |
[
|
69 |
"Model",
|
@@ -81,14 +104,13 @@ def model_info_tab(model_name=None):
|
|
81 |
|
82 |
def get_performance_chart(df):
|
83 |
df_sorted = df.sort_values("Model Avg", ascending=True)
|
84 |
-
colors = {"Private": "#
|
85 |
|
86 |
fig, ax = plt.subplots(figsize=(16, 10))
|
87 |
-
bar_height = 0.4
|
88 |
bars = ax.barh(
|
89 |
np.arange(len(df_sorted)),
|
90 |
df_sorted["Model Avg"],
|
91 |
-
height=
|
92 |
color=[colors[t] for t in df_sorted["Model Type"]],
|
93 |
)
|
94 |
|
@@ -113,12 +135,11 @@ def get_performance_chart(df):
|
|
113 |
plt.tight_layout()
|
114 |
return fig
|
115 |
|
116 |
-
|
117 |
def get_performance_cost_chart(df):
|
118 |
plt.figure(figsize=(12, 8), dpi=300)
|
119 |
plt.grid(True, linestyle="--", alpha=0.2)
|
120 |
|
121 |
-
colors = {"Private": "#
|
122 |
performance_colors = ["#DCFCE7", "#FEF9C3", "#FEE2E2"]
|
123 |
|
124 |
for _, row in df.iterrows():
|
@@ -164,28 +185,28 @@ def get_performance_cost_chart(df):
|
|
164 |
return plt.gcf()
|
165 |
|
166 |
|
167 |
-
def filter_leaderboard(model_type,
|
168 |
filtered_df = df.copy()
|
169 |
if model_type != "All":
|
170 |
filtered_df = filtered_df[filtered_df["Model Type"].str.strip() == model_type]
|
171 |
|
172 |
-
|
173 |
-
|
|
|
|
|
|
|
174 |
filtered_df["Rank"] = range(1, len(filtered_df) + 1)
|
175 |
|
176 |
perf_chart = get_performance_chart(filtered_df)
|
177 |
cost_chart = get_performance_cost_chart(filtered_df)
|
178 |
|
179 |
-
# Add Rank as first column
|
180 |
display_columns = [
|
181 |
"Rank",
|
182 |
"Model",
|
183 |
"Model Type",
|
184 |
-
dataset,
|
185 |
"Input cost per million token",
|
186 |
"Output cost per million token",
|
187 |
-
"
|
188 |
-
"multi turn perf",
|
189 |
]
|
190 |
|
191 |
table_html = filtered_df[display_columns].to_html(index=False)
|
@@ -203,10 +224,10 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
|
|
203 |
value="All",
|
204 |
label="Model Type",
|
205 |
)
|
206 |
-
|
207 |
-
choices=
|
208 |
-
value=
|
209 |
-
label="
|
210 |
)
|
211 |
|
212 |
with gr.Column(scale=4):
|
@@ -215,12 +236,13 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
|
|
215 |
plot1 = gr.Plot()
|
216 |
plot2 = gr.Plot()
|
217 |
|
218 |
-
for input_comp in [model_type,
|
219 |
input_comp.change(
|
220 |
fn=filter_leaderboard,
|
221 |
-
inputs=[model_type,
|
222 |
outputs=[output, plot1, plot2],
|
223 |
)
|
|
|
224 |
with gr.Tab("Model Performance"):
|
225 |
with gr.Row():
|
226 |
with gr.Column(scale=1):
|
@@ -229,7 +251,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
|
|
229 |
value=df.sort_values("Model Avg", ascending=False).iloc[0][
|
230 |
"Model"
|
231 |
],
|
232 |
-
|
|
|
233 |
)
|
234 |
with gr.Column(scale=4):
|
235 |
model_info = gr.HTML()
|
@@ -241,16 +264,14 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
|
|
241 |
outputs=[model_info, radar_plot],
|
242 |
)
|
243 |
|
244 |
-
# Modify app.load to initialize only leaderboard
|
245 |
app.load(
|
246 |
-
fn=lambda: filter_leaderboard("All",
|
247 |
outputs=[output, plot1, plot2],
|
248 |
)
|
249 |
|
250 |
-
# Add separate load event for model info tab
|
251 |
app.load(
|
252 |
fn=lambda: model_info_tab(
|
253 |
-
df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]
|
254 |
),
|
255 |
outputs=[model_info, radar_plot],
|
256 |
)
|
|
|
5 |
import plotly.graph_objects as go
|
6 |
|
7 |
df = pd.read_csv("results.csv").dropna()
|
|
|
8 |
|
9 |
+
categories = {
|
10 |
+
"Overall": ["Model Avg"],
|
11 |
+
"Overall single turn": ["single turn perf"],
|
12 |
+
"Overall multi turn": ["multi turn perf"],
|
13 |
+
"Single func call": [
|
14 |
+
"xlam_single_tool_single_call",
|
15 |
+
"xlam_multiple_tool_single_call",
|
16 |
+
],
|
17 |
+
"Multiple func call": [
|
18 |
+
"xlam_multiple_tool_multiple_call",
|
19 |
+
"xlam_single_tool_multiple_call",
|
20 |
+
"BFCL_v3_multi_turn_base_multi_func_call",
|
21 |
+
],
|
22 |
+
"Irrelevant query": ["BFCL_v3_irrelevance"],
|
23 |
+
"Long context": ["tau_long_context", "BFCL_v3_multi_turn_long_context"],
|
24 |
+
"Missing func": ["xlam_tool_miss", "BFCL_v3_multi_turn_miss_func"],
|
25 |
+
"Missing params": ["BFCL_v3_multi_turn_miss_param"],
|
26 |
+
"Composite": ["BFCL_v3_multi_turn_composite"],
|
27 |
+
}
|
28 |
+
|
29 |
+
|
30 |
+
def create_radar_plot(df, model_names):
|
31 |
datasets = df.columns[7:].tolist()
|
32 |
+
fig = go.Figure()
|
33 |
+
|
34 |
+
colors = ["rgba(99, 102, 241, 0.3)", "rgba(34, 197, 94, 0.3)"]
|
35 |
+
line_colors = ["#4F46E5", "#16A34A"]
|
36 |
+
|
37 |
+
for idx, model_name in enumerate(model_names):
|
38 |
+
model_data = df[df["Model"] == model_name].iloc[0]
|
39 |
+
values = [model_data[m] for m in datasets]
|
40 |
+
values.append(values[0])
|
41 |
+
datasets_plot = datasets + [datasets[0]]
|
42 |
+
|
43 |
+
fig.add_trace(
|
44 |
+
go.Scatterpolar(
|
45 |
+
r=values,
|
46 |
+
theta=datasets_plot,
|
47 |
+
fill="toself",
|
48 |
+
fillcolor=colors[idx % len(colors)],
|
49 |
+
line=dict(color=line_colors[idx % len(line_colors)], width=2),
|
50 |
+
name=model_name,
|
51 |
+
text=[f"{val:.3f}" for val in values],
|
52 |
+
textposition="middle right",
|
53 |
+
mode="lines+markers+text",
|
54 |
+
)
|
55 |
)
|
|
|
56 |
|
57 |
fig.update_layout(
|
58 |
polar=dict(
|
59 |
radialaxis=dict(
|
60 |
+
visible=True, range=[0, 1], showline=False, tickfont=dict(size=12)
|
|
|
|
|
|
|
61 |
),
|
62 |
angularaxis=dict(
|
63 |
tickfont=dict(size=13, family="Arial"),
|
|
|
65 |
direction="clockwise",
|
66 |
),
|
67 |
),
|
68 |
+
showlegend=True,
|
69 |
title=dict(
|
70 |
+
text="Model Comparison",
|
71 |
x=0.5,
|
72 |
y=0.95,
|
73 |
font=dict(size=24, family="Arial", color="#1F2937"),
|
|
|
80 |
|
81 |
return fig
|
82 |
|
|
|
|
|
|
|
83 |
|
84 |
+
def model_info_tab(model_names=None):
|
85 |
+
if model_names is None or len(model_names) == 0:
|
86 |
+
model_names = [df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]]
|
87 |
|
88 |
+
filtered_df = df[df["Model"].isin(model_names)]
|
89 |
+
radar_chart = create_radar_plot(df, model_names)
|
90 |
info_html = filtered_df[
|
91 |
[
|
92 |
"Model",
|
|
|
104 |
|
105 |
def get_performance_chart(df):
|
106 |
df_sorted = df.sort_values("Model Avg", ascending=True)
|
107 |
+
colors = {"Private": "#4F46E5", "Open source": "#16A34A"}
|
108 |
|
109 |
fig, ax = plt.subplots(figsize=(16, 10))
|
|
|
110 |
bars = ax.barh(
|
111 |
np.arange(len(df_sorted)),
|
112 |
df_sorted["Model Avg"],
|
113 |
+
height=0.4,
|
114 |
color=[colors[t] for t in df_sorted["Model Type"]],
|
115 |
)
|
116 |
|
|
|
135 |
plt.tight_layout()
|
136 |
return fig
|
137 |
|
|
|
138 |
def get_performance_cost_chart(df):
|
139 |
plt.figure(figsize=(12, 8), dpi=300)
|
140 |
plt.grid(True, linestyle="--", alpha=0.2)
|
141 |
|
142 |
+
colors = {"Private": "#4F46E5", "Open source": "#16A34A"}
|
143 |
performance_colors = ["#DCFCE7", "#FEF9C3", "#FEE2E2"]
|
144 |
|
145 |
for _, row in df.iterrows():
|
|
|
185 |
return plt.gcf()
|
186 |
|
187 |
|
188 |
+
def filter_leaderboard(model_type, category):
|
189 |
filtered_df = df.copy()
|
190 |
if model_type != "All":
|
191 |
filtered_df = filtered_df[filtered_df["Model Type"].str.strip() == model_type]
|
192 |
|
193 |
+
dataset_columns = categories.get(category, ["Model Avg"])
|
194 |
+
avg_score = filtered_df[dataset_columns].mean(axis=1)
|
195 |
+
filtered_df["Category Score"] = avg_score
|
196 |
+
|
197 |
+
filtered_df = filtered_df.sort_values(by="Category Score", ascending=False)
|
198 |
filtered_df["Rank"] = range(1, len(filtered_df) + 1)
|
199 |
|
200 |
perf_chart = get_performance_chart(filtered_df)
|
201 |
cost_chart = get_performance_cost_chart(filtered_df)
|
202 |
|
|
|
203 |
display_columns = [
|
204 |
"Rank",
|
205 |
"Model",
|
206 |
"Model Type",
|
|
|
207 |
"Input cost per million token",
|
208 |
"Output cost per million token",
|
209 |
+
"Category Score",
|
|
|
210 |
]
|
211 |
|
212 |
table_html = filtered_df[display_columns].to_html(index=False)
|
|
|
224 |
value="All",
|
225 |
label="Model Type",
|
226 |
)
|
227 |
+
category = gr.Dropdown(
|
228 |
+
choices=list(categories.keys()),
|
229 |
+
value=list(categories.keys())[0],
|
230 |
+
label="Category",
|
231 |
)
|
232 |
|
233 |
with gr.Column(scale=4):
|
|
|
236 |
plot1 = gr.Plot()
|
237 |
plot2 = gr.Plot()
|
238 |
|
239 |
+
for input_comp in [model_type, category]:
|
240 |
input_comp.change(
|
241 |
fn=filter_leaderboard,
|
242 |
+
inputs=[model_type, category],
|
243 |
outputs=[output, plot1, plot2],
|
244 |
)
|
245 |
+
|
246 |
with gr.Tab("Model Performance"):
|
247 |
with gr.Row():
|
248 |
with gr.Column(scale=1):
|
|
|
251 |
value=df.sort_values("Model Avg", ascending=False).iloc[0][
|
252 |
"Model"
|
253 |
],
|
254 |
+
multiselect=True,
|
255 |
+
label="Models",
|
256 |
)
|
257 |
with gr.Column(scale=4):
|
258 |
model_info = gr.HTML()
|
|
|
264 |
outputs=[model_info, radar_plot],
|
265 |
)
|
266 |
|
|
|
267 |
app.load(
|
268 |
+
fn=lambda: filter_leaderboard("All", list(categories.keys())[0]),
|
269 |
outputs=[output, plot1, plot2],
|
270 |
)
|
271 |
|
|
|
272 |
app.load(
|
273 |
fn=lambda: model_info_tab(
|
274 |
+
[df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]]
|
275 |
),
|
276 |
outputs=[model_info, radar_plot],
|
277 |
)
|