Pratik Bhavsar committed on
Commit
19b159e
·
1 Parent(s): e540986

improved radar chart and added categories

Browse files
Files changed (1) hide show
  1. app.py +74 -53
app.py CHANGED
@@ -5,36 +5,59 @@ import numpy as np
5
  import plotly.graph_objects as go
6
 
7
  df = pd.read_csv("results.csv").dropna()
8
- dataset_columns = df.columns[7:].tolist()
9
 
10
- def create_radar_plot(df, model_name):
11
- model_data = df[df["Model"] == model_name].iloc[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  datasets = df.columns[7:].tolist()
13
- values = [model_data[m] for m in datasets]
14
- values.append(values[0])
15
- datasets.append(datasets[0])
16
-
17
- fig = go.Figure(
18
- data=go.Scatterpolar(
19
- r=values,
20
- theta=datasets,
21
- fill="toself",
22
- fillcolor="rgba(99, 102, 241, 0.3)",
23
- line=dict(color="#4F46E5", width=2),
24
- name=model_name,
25
- text=[f"{val:.3f}" for val in values],
26
- textposition="middle right",
27
- mode="lines+markers+text",
 
 
 
 
 
 
 
 
28
  )
29
- )
30
 
31
  fig.update_layout(
32
  polar=dict(
33
  radialaxis=dict(
34
- visible=True,
35
- range=[0, 1],
36
- showline=False,
37
- tickfont=dict(size=12),
38
  ),
39
  angularaxis=dict(
40
  tickfont=dict(size=13, family="Arial"),
@@ -42,9 +65,9 @@ def create_radar_plot(df, model_name):
42
  direction="clockwise",
43
  ),
44
  ),
45
- showlegend=False,
46
  title=dict(
47
- text=model_name,
48
  x=0.5,
49
  y=0.95,
50
  font=dict(size=24, family="Arial", color="#1F2937"),
@@ -57,13 +80,13 @@ def create_radar_plot(df, model_name):
57
 
58
  return fig
59
 
60
- def model_info_tab(model_name=None):
61
- if model_name is None:
62
- model_name = df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]
63
 
64
- filtered_df = df[df["Model"] == model_name]
65
- radar_chart = create_radar_plot(df, model_name)
 
66
 
 
 
67
  info_html = filtered_df[
68
  [
69
  "Model",
@@ -81,14 +104,13 @@ def model_info_tab(model_name=None):
81
 
82
  def get_performance_chart(df):
83
  df_sorted = df.sort_values("Model Avg", ascending=True)
84
- colors = {"Private": "#4169E1", "Open source": "#7B68EE"}
85
 
86
  fig, ax = plt.subplots(figsize=(16, 10))
87
- bar_height = 0.4
88
  bars = ax.barh(
89
  np.arange(len(df_sorted)),
90
  df_sorted["Model Avg"],
91
- height=bar_height,
92
  color=[colors[t] for t in df_sorted["Model Type"]],
93
  )
94
 
@@ -113,12 +135,11 @@ def get_performance_chart(df):
113
  plt.tight_layout()
114
  return fig
115
 
116
-
117
  def get_performance_cost_chart(df):
118
  plt.figure(figsize=(12, 8), dpi=300)
119
  plt.grid(True, linestyle="--", alpha=0.2)
120
 
121
- colors = {"Private": "#6366F1", "Open source": "#22C55E"}
122
  performance_colors = ["#DCFCE7", "#FEF9C3", "#FEE2E2"]
123
 
124
  for _, row in df.iterrows():
@@ -164,28 +185,28 @@ def get_performance_cost_chart(df):
164
  return plt.gcf()
165
 
166
 
167
- def filter_leaderboard(model_type, dataset):
168
  filtered_df = df.copy()
169
  if model_type != "All":
170
  filtered_df = filtered_df[filtered_df["Model Type"].str.strip() == model_type]
171
 
172
- # Sort by selected dataset and add rank
173
- filtered_df = filtered_df.sort_values(by=dataset, ascending=False)
 
 
 
174
  filtered_df["Rank"] = range(1, len(filtered_df) + 1)
175
 
176
  perf_chart = get_performance_chart(filtered_df)
177
  cost_chart = get_performance_cost_chart(filtered_df)
178
 
179
- # Add Rank as first column
180
  display_columns = [
181
  "Rank",
182
  "Model",
183
  "Model Type",
184
- dataset,
185
  "Input cost per million token",
186
  "Output cost per million token",
187
- "single turn perf",
188
- "multi turn perf",
189
  ]
190
 
191
  table_html = filtered_df[display_columns].to_html(index=False)
@@ -203,10 +224,10 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
203
  value="All",
204
  label="Model Type",
205
  )
206
- dataset = gr.Dropdown(
207
- choices=["Model Avg"] + dataset_columns,
208
- value="Model Avg",
209
- label="Dataset",
210
  )
211
 
212
  with gr.Column(scale=4):
@@ -215,12 +236,13 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
215
  plot1 = gr.Plot()
216
  plot2 = gr.Plot()
217
 
218
- for input_comp in [model_type, dataset]:
219
  input_comp.change(
220
  fn=filter_leaderboard,
221
- inputs=[model_type, dataset],
222
  outputs=[output, plot1, plot2],
223
  )
 
224
  with gr.Tab("Model Performance"):
225
  with gr.Row():
226
  with gr.Column(scale=1):
@@ -229,7 +251,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
229
  value=df.sort_values("Model Avg", ascending=False).iloc[0][
230
  "Model"
231
  ],
232
- label="Model",
 
233
  )
234
  with gr.Column(scale=4):
235
  model_info = gr.HTML()
@@ -241,16 +264,14 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
241
  outputs=[model_info, radar_plot],
242
  )
243
 
244
- # Modify app.load to initialize only leaderboard
245
  app.load(
246
- fn=lambda: filter_leaderboard("All", "Model Avg"),
247
  outputs=[output, plot1, plot2],
248
  )
249
 
250
- # Add separate load event for model info tab
251
  app.load(
252
  fn=lambda: model_info_tab(
253
- df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]
254
  ),
255
  outputs=[model_info, radar_plot],
256
  )
 
5
  import plotly.graph_objects as go
6
 
7
  df = pd.read_csv("results.csv").dropna()
 
8
 
9
+ categories = {
10
+ "Overall": ["Model Avg"],
11
+ "Overall single turn": ["single turn perf"],
12
+ "Overall multi turn": ["multi turn perf"],
13
+ "Single func call": [
14
+ "xlam_single_tool_single_call",
15
+ "xlam_multiple_tool_single_call",
16
+ ],
17
+ "Multiple func call": [
18
+ "xlam_multiple_tool_multiple_call",
19
+ "xlam_single_tool_multiple_call",
20
+ "BFCL_v3_multi_turn_base_multi_func_call",
21
+ ],
22
+ "Irrelevant query": ["BFCL_v3_irrelevance"],
23
+ "Long context": ["tau_long_context", "BFCL_v3_multi_turn_long_context"],
24
+ "Missing func": ["xlam_tool_miss", "BFCL_v3_multi_turn_miss_func"],
25
+ "Missing params": ["BFCL_v3_multi_turn_miss_param"],
26
+ "Composite": ["BFCL_v3_multi_turn_composite"],
27
+ }
28
+
29
+
30
+ def create_radar_plot(df, model_names):
31
  datasets = df.columns[7:].tolist()
32
+ fig = go.Figure()
33
+
34
+ colors = ["rgba(99, 102, 241, 0.3)", "rgba(34, 197, 94, 0.3)"]
35
+ line_colors = ["#4F46E5", "#16A34A"]
36
+
37
+ for idx, model_name in enumerate(model_names):
38
+ model_data = df[df["Model"] == model_name].iloc[0]
39
+ values = [model_data[m] for m in datasets]
40
+ values.append(values[0])
41
+ datasets_plot = datasets + [datasets[0]]
42
+
43
+ fig.add_trace(
44
+ go.Scatterpolar(
45
+ r=values,
46
+ theta=datasets_plot,
47
+ fill="toself",
48
+ fillcolor=colors[idx % len(colors)],
49
+ line=dict(color=line_colors[idx % len(line_colors)], width=2),
50
+ name=model_name,
51
+ text=[f"{val:.3f}" for val in values],
52
+ textposition="middle right",
53
+ mode="lines+markers+text",
54
+ )
55
  )
 
56
 
57
  fig.update_layout(
58
  polar=dict(
59
  radialaxis=dict(
60
+ visible=True, range=[0, 1], showline=False, tickfont=dict(size=12)
 
 
 
61
  ),
62
  angularaxis=dict(
63
  tickfont=dict(size=13, family="Arial"),
 
65
  direction="clockwise",
66
  ),
67
  ),
68
+ showlegend=True,
69
  title=dict(
70
+ text="Model Comparison",
71
  x=0.5,
72
  y=0.95,
73
  font=dict(size=24, family="Arial", color="#1F2937"),
 
80
 
81
  return fig
82
 
 
 
 
83
 
84
+ def model_info_tab(model_names=None):
85
+ if model_names is None or len(model_names) == 0:
86
+ model_names = [df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]]
87
 
88
+ filtered_df = df[df["Model"].isin(model_names)]
89
+ radar_chart = create_radar_plot(df, model_names)
90
  info_html = filtered_df[
91
  [
92
  "Model",
 
104
 
105
  def get_performance_chart(df):
106
  df_sorted = df.sort_values("Model Avg", ascending=True)
107
+ colors = {"Private": "#4F46E5", "Open source": "#16A34A"}
108
 
109
  fig, ax = plt.subplots(figsize=(16, 10))
 
110
  bars = ax.barh(
111
  np.arange(len(df_sorted)),
112
  df_sorted["Model Avg"],
113
+ height=0.4,
114
  color=[colors[t] for t in df_sorted["Model Type"]],
115
  )
116
 
 
135
  plt.tight_layout()
136
  return fig
137
 
 
138
  def get_performance_cost_chart(df):
139
  plt.figure(figsize=(12, 8), dpi=300)
140
  plt.grid(True, linestyle="--", alpha=0.2)
141
 
142
+ colors = {"Private": "#4F46E5", "Open source": "#16A34A"}
143
  performance_colors = ["#DCFCE7", "#FEF9C3", "#FEE2E2"]
144
 
145
  for _, row in df.iterrows():
 
185
  return plt.gcf()
186
 
187
 
188
+ def filter_leaderboard(model_type, category):
189
  filtered_df = df.copy()
190
  if model_type != "All":
191
  filtered_df = filtered_df[filtered_df["Model Type"].str.strip() == model_type]
192
 
193
+ dataset_columns = categories.get(category, ["Model Avg"])
194
+ avg_score = filtered_df[dataset_columns].mean(axis=1)
195
+ filtered_df["Category Score"] = avg_score
196
+
197
+ filtered_df = filtered_df.sort_values(by="Category Score", ascending=False)
198
  filtered_df["Rank"] = range(1, len(filtered_df) + 1)
199
 
200
  perf_chart = get_performance_chart(filtered_df)
201
  cost_chart = get_performance_cost_chart(filtered_df)
202
 
 
203
  display_columns = [
204
  "Rank",
205
  "Model",
206
  "Model Type",
 
207
  "Input cost per million token",
208
  "Output cost per million token",
209
+ "Category Score",
 
210
  ]
211
 
212
  table_html = filtered_df[display_columns].to_html(index=False)
 
224
  value="All",
225
  label="Model Type",
226
  )
227
+ category = gr.Dropdown(
228
+ choices=list(categories.keys()),
229
+ value=list(categories.keys())[0],
230
+ label="Category",
231
  )
232
 
233
  with gr.Column(scale=4):
 
236
  plot1 = gr.Plot()
237
  plot2 = gr.Plot()
238
 
239
+ for input_comp in [model_type, category]:
240
  input_comp.change(
241
  fn=filter_leaderboard,
242
+ inputs=[model_type, category],
243
  outputs=[output, plot1, plot2],
244
  )
245
+
246
  with gr.Tab("Model Performance"):
247
  with gr.Row():
248
  with gr.Column(scale=1):
 
251
  value=df.sort_values("Model Avg", ascending=False).iloc[0][
252
  "Model"
253
  ],
254
+ multiselect=True,
255
+ label="Models",
256
  )
257
  with gr.Column(scale=4):
258
  model_info = gr.HTML()
 
264
  outputs=[model_info, radar_plot],
265
  )
266
 
 
267
  app.load(
268
+ fn=lambda: filter_leaderboard("All", list(categories.keys())[0]),
269
  outputs=[output, plot1, plot2],
270
  )
271
 
 
272
  app.load(
273
  fn=lambda: model_info_tab(
274
+ [df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]]
275
  ),
276
  outputs=[model_info, radar_plot],
277
  )