Pratik Bhavsar committed on
Commit
b0ce6f5
·
1 Parent(s): df66c39

added data exploration

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. app.py +7 -8
  2. chat.py +266 -115
  3. data_loader.py +19 -5
  4. get_exp_data.ipynb +167 -0
  5. output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_irrelevance.parquet +0 -0
  6. output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_base_multi_func_call.parquet +0 -0
  7. output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_base_single_func_call.parquet +0 -0
  8. output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_composite.parquet +0 -0
  9. output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_long_context.parquet +0 -0
  10. output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_miss_func.parquet +0 -0
  11. output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_miss_param.parquet +0 -0
  12. output/Llama-3.3-70B-Instruct-Turbo/tau_long_context.parquet +0 -0
  13. output/Llama-3.3-70B-Instruct-Turbo/toolace_single_func_call_1.parquet +0 -0
  14. output/Llama-3.3-70B-Instruct-Turbo/toolace_single_func_call_2.parquet +0 -0
  15. output/Llama-3.3-70B-Instruct-Turbo/xlam_multiple_tool_multiple_call.parquet +0 -0
  16. output/Llama-3.3-70B-Instruct-Turbo/xlam_multiple_tool_single_call.parquet +0 -0
  17. output/Llama-3.3-70B-Instruct-Turbo/xlam_single_tool_multiple_call.parquet +0 -0
  18. output/Llama-3.3-70B-Instruct-Turbo/xlam_single_tool_single_call.parquet +0 -0
  19. output/Llama-3.3-70B-Instruct-Turbo/xlam_tool_miss.parquet +0 -0
  20. output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_irrelevance.parquet +0 -0
  21. output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_base_multi_func_call.parquet +0 -0
  22. output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_base_single_func_call.parquet +0 -0
  23. output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_composite.parquet +0 -0
  24. output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_long_context.parquet +0 -0
  25. output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_miss_func.parquet +0 -0
  26. output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_miss_param.parquet +0 -0
  27. output/Meta-Llama-3.1-8B-Instruct-Turbo/tau_long_context.parquet +0 -0
  28. output/Meta-Llama-3.1-8B-Instruct-Turbo/toolace_single_func_call_1.parquet +0 -0
  29. output/Meta-Llama-3.1-8B-Instruct-Turbo/toolace_single_func_call_2.parquet +0 -0
  30. output/Meta-Llama-3.1-8B-Instruct-Turbo/xlam_multiple_tool_multiple_call.parquet +0 -0
  31. output/Meta-Llama-3.1-8B-Instruct-Turbo/xlam_multiple_tool_single_call.parquet +0 -0
  32. output/Meta-Llama-3.1-8B-Instruct-Turbo/xlam_single_tool_multiple_call.parquet +0 -0
  33. output/Meta-Llama-3.1-8B-Instruct-Turbo/xlam_single_tool_single_call.parquet +0 -0
  34. output/Meta-Llama-3.1-8B-Instruct-Turbo/xlam_tool_miss.parquet +0 -0
  35. output/claude-3-5-haiku-20241022/BFCL_v3_irrelevance.parquet +0 -0
  36. output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_base_multi_func_call.parquet +0 -0
  37. output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_base_single_func_call.parquet +0 -0
  38. output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_composite.parquet +0 -0
  39. output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_long_context.parquet +0 -0
  40. output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_miss_func.parquet +0 -0
  41. output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_miss_param.parquet +0 -0
  42. output/claude-3-5-haiku-20241022/tau_long_context.parquet +0 -0
  43. output/claude-3-5-haiku-20241022/toolace_single_func_call_1.parquet +0 -0
  44. output/claude-3-5-haiku-20241022/toolace_single_func_call_2.parquet +0 -0
  45. output/claude-3-5-haiku-20241022/xlam_multiple_tool_multiple_call.parquet +0 -0
  46. output/claude-3-5-haiku-20241022/xlam_multiple_tool_single_call.parquet +0 -0
  47. output/claude-3-5-haiku-20241022/xlam_single_tool_multiple_call.parquet +0 -0
  48. output/claude-3-5-haiku-20241022/xlam_single_tool_single_call.parquet +0 -0
  49. output/claude-3-5-haiku-20241022/xlam_tool_miss.parquet +0 -0
  50. output/claude-3-5-sonnet-20241022/BFCL_v3_irrelevance.parquet +0 -0
app.py CHANGED
@@ -3,7 +3,6 @@ import promptquality as pq
3
  from dotenv import load_dotenv
4
 
5
  load_dotenv()
6
- pq.login("https://console.demo.rungalileo.io")
7
 
8
  from data_loader import (
9
  load_data,
@@ -36,9 +35,9 @@ def create_app():
36
 
37
  mc_info, mc_plot = create_model_comparison_tab(df, HEADER_CONTENT)
38
 
39
- # exp_outputs = create_exploration_tab(
40
- # df, MODELS, DATASETS, SCORES, HEADER_CONTENT
41
- # )
42
 
43
  # Initial loads
44
  app.load(
@@ -55,10 +54,10 @@ def create_app():
55
  outputs=[mc_info, mc_plot],
56
  )
57
 
58
- # app.load(
59
- # fn=lambda: filter_and_update_display(MODELS[0], DATASETS[0], [], 0),
60
- # outputs=exp_outputs,
61
- # )
62
 
63
  return app
64
 
 
3
  from dotenv import load_dotenv
4
 
5
  load_dotenv()
 
6
 
7
  from data_loader import (
8
  load_data,
 
35
 
36
  mc_info, mc_plot = create_model_comparison_tab(df, HEADER_CONTENT)
37
 
38
+ exp_outputs = create_exploration_tab(
39
+ df, MODELS, DATASETS, SCORES, HEADER_CONTENT
40
+ )
41
 
42
  # Initial loads
43
  app.load(
 
54
  outputs=[mc_info, mc_plot],
55
  )
56
 
57
+ app.load(
58
+ fn=lambda: filter_and_update_display(MODELS[0], DATASETS[0], 0, 1, 0),
59
+ outputs=exp_outputs,
60
+ )
61
 
62
  return app
63
 
chat.py CHANGED
@@ -1,199 +1,350 @@
1
- # chat.py
2
  import gradio as gr
3
- import json
4
  import pandas as pd
5
- import numpy as np
6
- from functools import lru_cache
7
- import promptquality as pq
8
-
9
- project_name = "agent-lb-v1"
10
- PROJECT_ID = pq.get_project_from_name(project_name).id
11
-
12
-
13
- @lru_cache(maxsize=1000)
14
- def get_model_score_for_dataset(model, dataset):
15
- print(f"Getting metrics for {model} {project_name} for dataset {dataset}")
16
- run_name = f"{model} {dataset}"
17
- run_id = pq.get_run_from_name(run_name, PROJECT_ID).id
18
- rows = pq.get_rows(
19
- project_id=PROJECT_ID,
20
- run_id=run_id,
21
- task_type=None,
22
- config=None,
23
- starting_token=0,
24
- limit=1000,
25
- )
26
 
27
- rationales = [d.metrics.tool_selection_quality_rationale for d in rows]
28
- scores = [
29
- round(d.metrics.tool_selection_quality, 2)
30
- for d, rationale in zip(rows, rationales)
31
- if rationale
32
- ]
33
- explanations = [
34
- d.metrics.tool_selection_quality_explanation
35
- for d, rationale in zip(rows, rationales)
36
- if rationale
 
 
 
 
 
 
 
37
  ]
38
- rationales = [r for r in rationales if r]
39
- mean_score = round(np.mean(scores), 2)
40
- return {
41
- "mean_score": mean_score,
42
- "scores": scores,
43
- "rationales": rationales,
44
- "explanations": explanations,
45
- }
46
-
47
-
48
- def get_updated_df(df, data):
49
- df["rationale"] = data["rationales"]
50
- df["explanation"] = data["explanations"]
51
- df["score"] = data["scores"]
52
- return df
53
 
54
 
55
  def get_chat_and_score_df(model, dataset):
56
- data = get_model_score_for_dataset(model, dataset)
57
  df = pd.read_parquet(f"datasets/{dataset}.parquet")
58
- df = get_updated_df(df, data)
59
  return df
60
 
61
 
62
- def format_chat_message(role, content):
63
- """Format individual chat messages with proper styling."""
64
  role_style = role.lower()
 
 
 
 
 
 
 
 
 
 
65
  return f"""
66
- <div class="message {role_style}">
67
- <div class="role-badge {role_style}-role">{role}</div>
68
- <div class="content">{content}</div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  </div>
70
  """
71
 
72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  def format_tool_info(tools):
74
- """Format tool information with proper styling."""
75
  if isinstance(tools, str):
76
  try:
77
  tools = json.loads(tools)
78
  except:
79
- return "<div>No tool information available</div>"
80
 
81
  if not tools:
82
- return "<div>No tool information available</div>"
83
 
84
  tool_html = ""
85
  for tool in tools:
 
86
  tool_html += f"""
87
- <div class="tool-section">
88
- <div class="tool-name">{tool.get('name', 'Unnamed Tool')}</div>
89
- <div class="tool-description">{tool.get('description', 'No description available')}</div>
90
- <div class="tool-parameters">
91
- {format_parameters(tool.get('parameters', {}))}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  </div>
93
  </div>
94
  """
95
- return f'<div class="tool-info-panel">{tool_html}</div>'
96
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
98
  def format_parameters(parameters):
99
  if not parameters:
100
- return "<div>No parameters</div>"
101
 
102
  params_html = ""
103
  for name, desc in parameters.items():
104
  params_html += f"""
105
- <div class="parameter">
106
- <span class="param-name">{name}:</span> {desc}
 
 
 
 
 
 
 
 
 
 
 
107
  </div>
108
  """
109
  return params_html
110
 
111
-
112
  def format_metrics(score, rationale, explanation):
113
- """Format metrics display with proper styling."""
 
 
 
 
 
114
  return f"""
115
- <div class="metrics-panel">
116
- <div class="metric-section">
117
- <h3>Score</h3>
118
- <div class="score-display">{score:.2f}</div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  </div>
120
- <div class="metric-section">
121
- <h3>Rationale</h3>
122
- <div class="explanation-text">{rationale}</div>
 
 
 
 
 
 
 
 
123
  </div>
124
- <div class="metric-section">
125
- <h3>Explanation</h3>
126
- <div class="explanation-text">{explanation}</div>
 
 
 
 
 
 
 
 
127
  </div>
128
  </div>
129
  """
130
 
131
-
132
  def update_chat_display(df, index):
133
- """Update the chat visualization for a specific index."""
134
  if df is None or df.empty or index >= len(df):
135
  return (
136
- "<div>No data available</div>",
137
- "<div>No metrics available</div>",
138
- "<div>No tool information available</div>",
139
  )
140
 
141
  row = df.iloc[index]
142
 
143
- # Format chat messages
144
  messages = json.loads(row["conversation"])
 
 
 
 
 
 
 
 
 
 
 
145
  chat_html = f"""
146
- <div class="chat-panel">
147
- {"".join([format_chat_message(msg["role"], msg["content"])
148
- for msg in messages])}
 
 
 
 
149
  </div>
150
  """
151
 
152
- # Format metrics
153
  metrics_html = format_metrics(row["score"], row["rationale"], row["explanation"])
154
-
155
- # Format tool info
156
  tool_html = format_tool_info(row["tools_langchain"])
157
 
158
  return chat_html, metrics_html, tool_html
159
 
160
 
161
- def filter_and_update_display(model, dataset, selected_scores, current_index):
162
  try:
163
- # Get data and filter by scores
164
  df_chat = get_chat_and_score_df(model, dataset)
165
- if selected_scores:
166
- df_chat = df_chat[df_chat["score"].isin(selected_scores)]
 
167
 
168
  if df_chat.empty:
169
  return (
170
- "<div>No data available for selected filters</div>",
171
- "<div>No metrics available</div>",
172
- "<div>No tool information available</div>",
173
- gr.update(maximum=0, value=0),
174
  "0/0",
175
  )
176
 
177
- # Update index bounds
178
  max_index = len(df_chat) - 1
179
  current_index = min(current_index, max_index)
180
-
181
- # Get displays for current index
182
  chat_html, metrics_html, tool_html = update_chat_display(df_chat, current_index)
183
 
184
- return (
185
- chat_html,
186
- metrics_html,
187
- tool_html,
188
- gr.update(maximum=max_index, value=current_index),
189
- f"{current_index + 1}/{len(df_chat)}",
190
- )
191
  except Exception as e:
192
- print(f"Error in filter_and_update_display: {str(e)}")
 
 
 
 
 
 
 
 
 
193
  return (
194
- f"<div>Error: {str(e)}</div>",
195
- "<div>No metrics available</div>",
196
- "<div>No tool information available</div>",
197
- gr.update(maximum=0, value=0),
198
  "0/0",
199
  )
 
 
1
  import gradio as gr
 
2
  import pandas as pd
3
+ import json
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
+
6
def get_updated_df(df, df_output):
    """Attach model responses and TSQ metrics to the dataset rows.

    Rows are paired purely by position: the i-th row of ``df_output`` is
    attached to the i-th row of ``df``. Both tables are cut to their common
    length first, so a longer output table no longer raises a
    length-mismatch ``ValueError``.

    NOTE(review): positional pairing is only correct if ``df_output`` keeps
    the dataset's row order with no rows dropped in between; the export
    notebook appears to drop rows that have no rationale -- confirm.

    Args:
        df: Dataset frame; must contain ``conversation``, ``tools_langchain``,
            ``n_turns``, ``len_query`` and ``n_tools``.
        df_output: Model-output frame; must contain ``response``,
            ``rationale``, ``explanation`` and ``score``.

    Returns:
        A new DataFrame (the input ``df`` is not modified) restricted to the
        columns used by the exploration view.

    Raises:
        KeyError: If a required column is missing from either frame.
    """
    n_rows = min(len(df), len(df_output))
    df = df.iloc[:n_rows].copy()
    for column in ("response", "rationale", "explanation", "score"):
        # .tolist() drops df_output's index so assignment is positional.
        df[column] = df_output[column].iloc[:n_rows].tolist()
    cols = [
        "conversation",
        "tools_langchain",
        "n_turns",
        "len_query",
        "n_tools",
        "response",
        "rationale",
        "explanation",
        "score",
    ]
    return df[cols]


def get_chat_and_score_df(model, dataset):
    """Load one dataset together with a model's stored outputs for it.

    Reads ``output/{model}/{dataset}.parquet`` and
    ``datasets/{dataset}.parquet`` (relative to the working directory) and
    merges them with ``get_updated_df``.
    """
    df_output = pd.read_parquet(f"output/{model}/{dataset}.parquet")
    df = pd.read_parquet(f"datasets/{dataset}.parquet")
    df = get_updated_df(df, df_output)
    return df
31
 
32
 
33
def format_chat_message(role, content, is_response=False):
    """Render a single chat message as an HTML bubble.

    Args:
        role: Speaker label. Compared case-insensitively: "user" messages are
            right-aligned, every other role is left-aligned.
        content: Message text. ``None`` renders as an empty bubble and other
            non-string values are converted with ``str``.
        is_response: When true the bubble uses the ``--response-bg``
            background, a monospace font, and " Response" is appended to the
            role label.

    Returns:
        An HTML fragment as a string. Role and content are HTML-escaped so
        that markup inside dataset text or model output is shown literally
        instead of being interpreted by the browser.
    """
    from html import escape  # function-scope import keeps the block self-contained

    role_style = role.lower()
    alignment = "flex-end" if role_style == "user" else "flex-start"
    max_width = "80%"

    text = "" if content is None else str(content)
    # Strip padding on each line but keep the line breaks themselves; the
    # bubble uses `white-space: pre-wrap`, so they are rendered as written.
    cleaned_content = escape("\n".join(line.strip() for line in text.split("\n")))

    background_color = (
        "var(--response-bg)"
        if is_response
        else f"var(--message-bg-{escape(role_style)})"
    )
    font_family = "monospace" if is_response else "inherit"
    font_size = "0.9rem" if is_response else "inherit"
    label = escape(role + (" Response" if is_response else ""))

    # The content is placed flush against its tags: inside a pre-wrap element
    # any template indentation around it would be displayed as real whitespace.
    return f"""
    <div style="
        display: flex;
        justify-content: {alignment};
        margin: 0.75rem 0;">
        <div style="
            max-width: {max_width};
            padding: 1rem;
            border-radius: 12px;
            background-color: {background_color};
            border: 1px solid var(--border-color);
            box-shadow: 0 1px 2px var(--shadow-color);">
            <div style="
                font-weight: 600;
                color: var(--primary-text);
                margin-bottom: 0.5rem;
                font-size: 0.9rem;
                text-transform: uppercase;">
                {label}
            </div>
            <div style="
                color: var(--text-color);
                line-height: 1.6;
                white-space: pre-wrap;
                font-family: {font_family};
                font-size: {font_size};">{cleaned_content}</div>
        </div>
    </div>
    """
77
 
78
 
79
def format_response(response):
    """Return a display string for a model response.

    JSON text (or an already-parsed dict/list) is pretty-printed with a
    two-space indent; anything else is returned via ``str``. Non-ASCII
    characters are kept as-is rather than being turned into ``\\uXXXX``
    escapes, and a missing response (``None``) yields an empty string.

    Args:
        response: JSON string, plain text, parsed JSON value, or ``None``.

    Returns:
        The formatted response text.
    """
    if response is None:
        return ""
    try:
        parsed = (
            response if isinstance(response, (dict, list)) else json.loads(response)
        )
        return json.dumps(parsed, indent=2, ensure_ascii=False)
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError (not JSON); TypeError covers
        # inputs json cannot parse or serialise. Fall back to the raw text.
        return str(response)
91
+
92
+
93
def parse_tool_schema(tool):
    """Extract name, description and parameter summaries from a tool schema.

    The name is read from ``title`` and falls back to ``name``; parameters
    come from the top-level ``properties`` mapping. Each parameter is
    summarised as ``"<description> (Type: <type>)"``.

    Args:
        tool: Mapping describing one tool.

    Returns:
        A ``(name, description, parameters)`` tuple where ``parameters`` maps
        parameter names to their summary strings. Missing fields are replaced
        by placeholder text and a missing or malformed ``properties`` value
        yields an empty mapping.
    """
    name = tool.get("title") or tool.get("name") or "Unnamed Tool"
    description = tool.get("description") or "No description available"

    properties = tool.get("properties")
    if not isinstance(properties, dict):
        properties = {}

    parameters = {}
    for param_name, param_data in properties.items():
        if isinstance(param_data, dict):
            param_desc = param_data.get("description", "No description")
            param_type = param_data.get("type", "unknown")
            parameters[param_name] = f"{param_desc} (Type: {param_type})"
        else:
            # Not a schema object; show whatever was stored rather than crash.
            parameters[param_name] = str(param_data)

    return name, description, parameters
106
+
107
+
def format_tool_info(tools):
    """Render the tool list as a scrollable HTML panel.

    The panel also carries the ``<style>`` block that defines the light and
    dark theme CSS variables referenced by the inline styles in this module.

    Args:
        tools: A JSON string, a list of tool schema mappings, or a single
            tool schema mapping. Falsy values and undecodable JSON produce a
            "No tool information available" placeholder.

    Returns:
        An HTML fragment as a string. Tool names and descriptions are
        HTML-escaped.
    """
    from html import escape  # function-scope import keeps the block self-contained

    placeholder = '<div style="padding: 1rem; color: var(--text-muted);">No tool information available</div>'

    if isinstance(tools, str):
        try:
            tools = json.loads(tools)
        except json.JSONDecodeError:
            return placeholder

    if not tools:
        return placeholder

    if isinstance(tools, dict):
        # A lone schema object: iterating it would walk its keys, not tools.
        tools = [tools]

    sections = []
    for tool in tools:
        name, description, parameters = parse_tool_schema(tool)
        sections.append(
            f"""
        <div style="
            margin: 1rem 0;
            padding: 1.5rem;
            border-radius: 8px;
            background-color: var(--surface-color);
            border: 1px solid var(--border-color);">
            <div style="
                font-weight: 600;
                color: var(--primary-text);
                margin-bottom: 0.75rem;
                font-size: 1.1rem;">
                {escape(str(name))}
            </div>
            <div style="
                color: var(--text-color);
                margin-bottom: 1rem;
                line-height: 1.5;">
                {escape(str(description))}
            </div>
            <div style="
                background-color: var(--surface-color-alt);
                padding: 1rem;
                border-radius: 4px;
                border: 1px solid var(--border-color);">
                {format_parameters(parameters)}
            </div>
        </div>
        """
        )
    tool_html = "".join(sections)

    # --response-bg is defined for BOTH themes; it was previously missing from
    # the light theme, leaving response bubbles without a background there.
    return f"""
    <div style="
        max-height: 600px;
        overflow-y: auto;
        padding-right: 0.5rem;">
        <style>
            :root[data-theme="light"] {{
                --surface-color: #f8f9fa;
                --surface-color-alt: #ffffff;
                --text-color: #202124;
                --text-muted: #666666;
                --primary-text: #1a73e8;
                --border-color: #e9ecef;
                --shadow-color: rgba(0,0,0,0.1);
                --message-bg-user: #E5F6FD;
                --message-bg-assistant: #F7F7F8;
                --message-bg-system: #FFF3E0;
                --response-bg: #EEF2FF;
                --score-high: #1a73e8;
                --score-med: #f4b400;
                --score-low: #ea4335;
            }}

            :root[data-theme="dark"] {{
                --surface-color: #1e1e1e;
                --surface-color-alt: #2d2d2d;
                --text-color: #ffffff;
                --text-muted: #a0a0a0;
                --primary-text: #60a5fa;
                --border-color: #404040;
                --shadow-color: rgba(0,0,0,0.3);
                --message-bg-user: #2d3748;
                --message-bg-assistant: #1a1a1a;
                --message-bg-system: #2c2516;
                --response-bg: #2a2f3a;
                --score-high: #60a5fa;
                --score-med: #fbbf24;
                --score-low: #ef4444;
            }}
        </style>
        {tool_html}
    </div>
    """
193
 
def format_parameters(parameters):
    """Render a mapping of parameter name -> description as HTML rows.

    Args:
        parameters: Mapping of parameter names to description text. A falsy
            value (``None`` or an empty mapping) produces a "No parameters"
            placeholder.

    Returns:
        An HTML fragment as a string with one block per parameter. Names and
        descriptions are HTML-escaped.
    """
    from html import escape  # function-scope import keeps the block self-contained

    if not parameters:
        return '<div style="color: var(--text-muted);">No parameters</div>'

    rows = [
        f"""
        <div style="margin: 0.75rem 0;">
            <div style="
                font-weight: 500;
                color: var(--primary-text);
                margin-bottom: 0.25rem;">
                {escape(str(name))}
            </div>
            <div style="
                color: var(--text-color);
                line-height: 1.4;
                font-size: 0.95rem;">
                {escape(str(desc))}
            </div>
        </div>
        """
        for name, desc in parameters.items()
    ]
    return "".join(rows)
217
 
 
def format_metrics(score, rationale, explanation):
    """Render the TSQ score, rationale and explanation as an HTML panel.

    The score is coloured with ``--score-high`` at 0.7 and above,
    ``--score-med`` at 0.4 and above, and ``--score-low`` otherwise. A score
    that is missing or not a number is shown as "N/A" in the muted colour
    instead of raising a formatting error.

    Args:
        score: Numeric score, or ``None``/NaN when unavailable.
        rationale: Rationale text; ``None`` renders as empty.
        explanation: Explanation text; ``None`` renders as empty.

    Returns:
        An HTML fragment as a string. Rationale and explanation are
        HTML-escaped.
    """
    from html import escape  # function-scope imports keep the block self-contained
    import math

    try:
        score_value = float(score)
    except (TypeError, ValueError):
        score_value = math.nan

    if math.isnan(score_value):
        score_color = "var(--text-muted)"
        score_text = "N/A"
    else:
        if score_value >= 0.7:
            score_color = "var(--score-high)"
        elif score_value >= 0.4:
            score_color = "var(--score-med)"
        else:
            score_color = "var(--score-low)"
        score_text = f"{score_value:.2f}"

    rationale_text = "" if rationale is None else escape(str(rationale))
    explanation_text = "" if explanation is None else escape(str(explanation))

    return f"""
    <div style="
        padding: 1.5rem;
        background-color: var(--surface-color);
        border-radius: 8px;
        border: 1px solid var(--border-color);
        box-shadow: 0 2px 4px var(--shadow-color);">
        <div style="margin-bottom: 1.5rem;">
            <h3 style="
                color: var(--text-color);
                font-size: 1.1rem;
                margin-bottom: 0.5rem;
                font-weight: 600;">TSQ Score</h3>
            <div style="
                font-size: 2rem;
                font-weight: 600;
                color: {score_color};">
                {score_text}
            </div>
        </div>
        <div style="margin-bottom: 1.5rem;">
            <h3 style="
                color: var(--text-color);
                font-size: 1.1rem;
                margin-bottom: 0.5rem;
                font-weight: 600;">Rationale</h3>
            <div style="
                color: var(--text-color);
                line-height: 1.5;">
                {rationale_text}
            </div>
        </div>
        <div>
            <h3 style="
                color: var(--text-color);
                font-size: 1.1rem;
                margin-bottom: 0.5rem;
                font-weight: 600;">Explanation</h3>
            <div style="
                color: var(--text-color);
                line-height: 1.5;">
                {explanation_text}
            </div>
        </div>
    </div>
    """
271
 
 
def update_chat_display(df, index):
    """Build the chat, metrics and tool HTML panels for one row.

    Args:
        df: Frame with ``conversation``, ``response``, ``score``,
            ``rationale``, ``explanation`` and ``tools_langchain`` columns.
        index: Zero-based row position. ``None``, negative or out-of-range
            values yield the placeholder panels (a negative position is no
            longer allowed to wrap around to the end of the frame).

    Returns:
        A ``(chat_html, metrics_html, tool_html)`` tuple of HTML strings.
    """
    placeholders = (
        '<div style="padding: 1rem; color: var(--text-muted);">No data available</div>',
        '<div style="padding: 1rem; color: var(--text-muted);">No metrics available</div>',
        '<div style="padding: 1rem; color: var(--text-muted);">No tool information available</div>',
    )
    if df is None or df.empty or index is None:
        return placeholders

    # Positions may arrive as floats (e.g. from a UI control); iloc needs an int.
    index = int(index)
    if not 0 <= index < len(df):
        return placeholders

    row = df.iloc[index]

    conversation = row["conversation"]
    # Stored as JSON text; accept an already-decoded sequence as well.
    messages = (
        json.loads(conversation) if isinstance(conversation, str) else conversation
    )
    formatted_response = format_response(row["response"])

    # Conversation turns first, then the model's response as the final bubble.
    all_messages = [
        format_chat_message(msg["role"], msg["content"]) for msg in messages
    ]
    all_messages.append(
        format_chat_message("Assistant", formatted_response, is_response=True)
    )

    chat_html = f"""
    <div style="
        background-color: var(--surface-color);
        border-radius: 8px;
        border: 1px solid var(--border-color);
        box-shadow: 0 2px 4px var(--shadow-color);
        padding: 1.5rem;">
        {"".join(all_messages)}
    </div>
    """

    metrics_html = format_metrics(row["score"], row["rationale"], row["explanation"])
    tool_html = format_tool_info(row["tools_langchain"])

    return chat_html, metrics_html, tool_html
310
 
311
 
312
def filter_and_update_display(model, dataset, min_score, max_score, current_index):
    """Load a model/dataset pair, filter by score range and render one row.

    Args:
        model: Model name used to locate the stored outputs.
        dataset: Dataset name.
        min_score: Inclusive lower bound on ``score``.
        max_score: Inclusive upper bound on ``score``.
        current_index: Requested zero-based position within the filtered
            rows; ``None`` is treated as 0 and the value is clamped into the
            valid range.

    Returns:
        A ``(chat_html, metrics_html, tool_html, index_display)`` tuple. When
        no rows match, or when anything fails, placeholder panels are
        returned with ``"0/0"`` as the index display. This function never
        raises: failures are reported inside the first panel.
    """
    from html import escape  # function-scope import keeps the block self-contained

    no_metrics = '<div style="padding: 1rem; color: var(--text-muted);">No metrics available</div>'
    no_tools = '<div style="padding: 1rem; color: var(--text-muted);">No tool information available</div>'

    try:
        df_chat = get_chat_and_score_df(model, dataset)
        df_chat = df_chat[
            (df_chat["score"] >= min_score) & (df_chat["score"] <= max_score)
        ]

        if df_chat.empty:
            return (
                '<div style="padding: 1rem; color: var(--text-muted);">No data available for selected filters</div>',
                no_metrics,
                no_tools,
                "0/0",
            )

        max_index = len(df_chat) - 1
        # Clamp on both sides: a stale or negative position must not select a
        # row from the end of the frame.
        current_index = max(0, min(int(current_index or 0), max_index))
        chat_html, metrics_html, tool_html = update_chat_display(df_chat, current_index)

        index_display = f'<div style="font-weight: 500; color: var(--primary-text);">{current_index + 1}/{len(df_chat)}</div>'
        return chat_html, metrics_html, tool_html, index_display

    except Exception as e:
        # UI boundary: show the failure in the panel instead of raising. The
        # message is escaped because it can contain file paths or data text.
        error_html = f"""
        <div style="
            padding: 1rem;
            color: var(--score-low);
            background-color: var(--surface-color);
            border: 1px solid var(--score-low);
            border-radius: 4px;">
            Error: {escape(str(e))}
        </div>
        """
        return (
            error_html,
            no_metrics,
            no_tools,
            "0/0",
        )
data_loader.py CHANGED
@@ -1,11 +1,25 @@
1
  import pandas as pd
2
- from glob import glob
3
- import numpy as np
4
- from pathlib import Path
5
 
6
 
7
- DATASETS = [Path(file).stem for file in glob("datasets/*.parquet")]
8
- SCORES = [round(x, 2) for x in np.arange(0, 1.1, 0.1).tolist()]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  def load_data():
11
  """Load and preprocess the data."""
 
1
  import pandas as pd
 
 
 
2
 
3
 
4
+ DATASETS = [
5
+ "BFCL_v3_irrelevance",
6
+ "BFCL_v3_multi_turn_base_multi_func_call",
7
+ "BFCL_v3_multi_turn_base_single_func_call",
8
+ "BFCL_v3_multi_turn_composite",
9
+ "BFCL_v3_multi_turn_long_context",
10
+ "BFCL_v3_multi_turn_miss_func",
11
+ "BFCL_v3_multi_turn_miss_param",
12
+ "tau_long_context",
13
+ "toolace_single_func_call_1",
14
+ "toolace_single_func_call_2",
15
+ "xlam_multiple_tool_multiple_call",
16
+ "xlam_multiple_tool_single_call",
17
+ "xlam_single_tool_multiple_call",
18
+ "xlam_single_tool_single_call",
19
+ "xlam_tool_miss",
20
+ ]
21
+
22
+ SCORES = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
23
 
24
  def load_data():
25
  """Load and preprocess the data."""
get_exp_data.ipynb ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "import pandas as pd\n",
11
+ "import numpy as np\n",
12
+ "from functools import lru_cache\n",
13
+ "from concurrent.futures import ThreadPoolExecutor\n",
14
+ "import promptquality as pq\n",
15
+ "from dotenv import load_dotenv\n",
16
+ "from data_loader import DATASETS, load_data\n",
17
+ "from tqdm.auto import tqdm\n",
18
+ "\n",
19
+ "load_dotenv()\n",
20
+ "pq.login(\"https://console.demo.rungalileo.io\")"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": 2,
26
+ "metadata": {},
27
+ "outputs": [],
28
+ "source": [
29
+ "project_name = \"agent-lb-v1\"\n",
30
+ "PROJECT_ID = pq.get_project_from_name(project_name).id\n",
31
+ "\n",
32
+ "\n",
33
+ "@lru_cache(maxsize=1000)\n",
34
+ "def get_output_df(model, dataset):\n",
35
+ " print(f\"Getting metrics for {model} {project_name} for dataset {dataset}\")\n",
36
+ " run_name = f\"{model} {dataset}\"\n",
37
+ " run_id = pq.get_run_from_name(run_name, PROJECT_ID).id\n",
38
+ " rows = pq.get_rows(\n",
39
+ " project_id=PROJECT_ID,\n",
40
+ " run_id=run_id,\n",
41
+ " task_type=None,\n",
42
+ " config=None,\n",
43
+ " starting_token=0,\n",
44
+ " limit=1000,\n",
45
+ " )\n",
46
+ "\n",
47
+ " rationales = [d.metrics.tool_selection_quality_rationale for d in rows]\n",
48
+ "\n",
49
+ " scores = [\n",
50
+ " round(d.metrics.tool_selection_quality, 2)\n",
51
+ " for d, rationale in zip(rows, rationales)\n",
52
+ " if rationale\n",
53
+ " ]\n",
54
+ " \n",
55
+ " explanations = [\n",
56
+ " d.metrics.tool_selection_quality_explanation\n",
57
+ " for d, rationale in zip(rows, rationales)\n",
58
+ " if rationale\n",
59
+ " ]\n",
60
+ " \n",
61
+ " responses = [d.response for d, rationale in zip(rows, rationales)\n",
62
+ " if rationale\n",
63
+ " ]\n",
64
+ " \n",
65
+ " rationales = [r for r in rationales if r]\n",
66
+ " mean_score = round(np.mean(scores), 2)\n",
67
+ " \n",
68
+ " data = {\n",
69
+ " \"response\": responses,\n",
70
+ " \"mean_score\": mean_score,\n",
71
+ " \"score\": scores,\n",
72
+ " \"rationale\": rationales,\n",
73
+ " \"explanation\": explanations,\n",
74
+ " }\n",
75
+ " return pd.DataFrame(data)\n",
76
+ "\n",
77
+ "def save_output_df(df, model, dataset):\n",
78
+ " os.makedirs(f\"output/{model}\", exist_ok=True)\n",
79
+ " df.to_parquet(f\"output/{model}/{dataset}.parquet\")\n",
80
+ "\n",
81
+ "def get_updated_df(df, df_output):\n",
82
+ " df = df.iloc[:len(df_output)].copy()\n",
83
+ " \n",
84
+ " df[\"response\"] = df_output[\"response\"].tolist()\n",
85
+ " df[\"rationale\"] = df_output[\"rationale\"].tolist()\n",
86
+ " df[\"explanation\"] = df_output[\"explanation\"].tolist()\n",
87
+ " df[\"score\"] = df_output[\"score\"].tolist()\n",
88
+ " cols = ['conversation', 'tools_langchain', 'n_turns',\n",
89
+ " 'len_query', 'n_tools', 'response', 'rationale', 'explanation', 'score']\n",
90
+ " return df[cols]\n",
91
+ "\n",
92
+ "\n",
93
+ "def get_chat_and_score_df(model, dataset):\n",
94
+ " df_output = pd.read_parquet(f\"output/{model}/{dataset}.parquet\")\n",
95
+ " df = pd.read_parquet(f\"datasets/{dataset}.parquet\")\n",
96
+ " df = get_updated_df(df, df_output)\n",
97
+ " return df"
98
+ ]
99
+ },
100
+ {
101
+ "cell_type": "code",
102
+ "execution_count": null,
103
+ "metadata": {},
104
+ "outputs": [],
105
+ "source": [
106
+ "def process_dataset(args):\n",
107
+ " model, dataset = args\n",
108
+ " if os.path.exists(f\"output/{model}/{dataset}.parquet\"):\n",
109
+ " return None\n",
110
+ " print(model, dataset)\n",
111
+ " df_output = get_output_df(model, dataset)\n",
112
+ " save_output_df(df_output, model, dataset)\n",
113
+ " return f\"Completed: {model} - {dataset}\"\n",
114
+ "\n",
115
+ "def process_model_datasets(model, datasets, max_workers=5):\n",
116
+ " with ThreadPoolExecutor(max_workers=max_workers) as executor:\n",
117
+ " # Create arguments list for each dataset\n",
118
+ " args_list = [(model, dataset) for dataset in datasets]\n",
119
+ " \n",
120
+ " # Process datasets in parallel with progress bar\n",
121
+ " list(tqdm(\n",
122
+ " executor.map(process_dataset, args_list),\n",
123
+ " total=len(datasets),\n",
124
+ " desc=f\"Datasets ({model})\",\n",
125
+ " position=1,\n",
126
+ " leave=False\n",
127
+ " ))\n",
128
+ "\n",
129
+ "\n",
130
+ "models = [\"accounts/fireworks/models/qwen2p5-72b-instruct\", \"meta-llama/Llama-3.3-70B-Instruct-Turbo\", \"meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo\"]\n",
131
+ "# models = load_data()[\"Model\"]\n",
132
+ "\n",
133
+ "# Process each model sequentially, but datasets in parallel\n",
134
+ "for model in tqdm(models, desc=\"Models\", position=0):\n",
135
+ " process_model_datasets(model, DATASETS)\n"
136
+ ]
137
+ },
138
+ {
139
+ "cell_type": "code",
140
+ "execution_count": null,
141
+ "metadata": {},
142
+ "outputs": [],
143
+ "source": []
144
+ }
145
+ ],
146
+ "metadata": {
147
+ "kernelspec": {
148
+ "display_name": "langgraph",
149
+ "language": "python",
150
+ "name": "python3"
151
+ },
152
+ "language_info": {
153
+ "codemirror_mode": {
154
+ "name": "ipython",
155
+ "version": 3
156
+ },
157
+ "file_extension": ".py",
158
+ "mimetype": "text/x-python",
159
+ "name": "python",
160
+ "nbconvert_exporter": "python",
161
+ "pygments_lexer": "ipython3",
162
+ "version": "3.12.6"
163
+ }
164
+ },
165
+ "nbformat": 4,
166
+ "nbformat_minor": 2
167
+ }
output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_irrelevance.parquet ADDED
Binary file (36.4 kB). View file
 
output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_base_multi_func_call.parquet ADDED
Binary file (25.4 kB). View file
 
output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_base_single_func_call.parquet ADDED
Binary file (22.9 kB). View file
 
output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_composite.parquet ADDED
Binary file (42.4 kB). View file
 
output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_long_context.parquet ADDED
Binary file (38 kB). View file
 
output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_miss_func.parquet ADDED
Binary file (41.6 kB). View file
 
output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_miss_param.parquet ADDED
Binary file (42.7 kB). View file
 
output/Llama-3.3-70B-Instruct-Turbo/tau_long_context.parquet ADDED
Binary file (47.1 kB). View file
 
output/Llama-3.3-70B-Instruct-Turbo/toolace_single_func_call_1.parquet ADDED
Binary file (13.1 kB). View file
 
output/Llama-3.3-70B-Instruct-Turbo/toolace_single_func_call_2.parquet ADDED
Binary file (11.5 kB). View file
 
output/Llama-3.3-70B-Instruct-Turbo/xlam_multiple_tool_multiple_call.parquet ADDED
Binary file (104 kB). View file
 
output/Llama-3.3-70B-Instruct-Turbo/xlam_multiple_tool_single_call.parquet ADDED
Binary file (39.3 kB). View file
 
output/Llama-3.3-70B-Instruct-Turbo/xlam_single_tool_multiple_call.parquet ADDED
Binary file (30.4 kB). View file
 
output/Llama-3.3-70B-Instruct-Turbo/xlam_single_tool_single_call.parquet ADDED
Binary file (43.8 kB). View file
 
output/Llama-3.3-70B-Instruct-Turbo/xlam_tool_miss.parquet ADDED
Binary file (49.4 kB). View file
 
output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_irrelevance.parquet ADDED
Binary file (41.5 kB). View file
 
output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_base_multi_func_call.parquet ADDED
Binary file (28.8 kB). View file
 
output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_base_single_func_call.parquet ADDED
Binary file (24.3 kB). View file
 
output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_composite.parquet ADDED
Binary file (59 kB). View file
 
output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_long_context.parquet ADDED
Binary file (45.7 kB). View file
 
output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_miss_func.parquet ADDED
Binary file (49.5 kB). View file
 
output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_miss_param.parquet ADDED
Binary file (45.8 kB). View file
 
output/Meta-Llama-3.1-8B-Instruct-Turbo/tau_long_context.parquet ADDED
Binary file (106 kB). View file
 
output/Meta-Llama-3.1-8B-Instruct-Turbo/toolace_single_func_call_1.parquet ADDED
Binary file (18.3 kB). View file
 
output/Meta-Llama-3.1-8B-Instruct-Turbo/toolace_single_func_call_2.parquet ADDED
Binary file (14.7 kB). View file
 
output/Meta-Llama-3.1-8B-Instruct-Turbo/xlam_multiple_tool_multiple_call.parquet ADDED
Binary file (103 kB). View file
 
output/Meta-Llama-3.1-8B-Instruct-Turbo/xlam_multiple_tool_single_call.parquet ADDED
Binary file (39.9 kB). View file
 
output/Meta-Llama-3.1-8B-Instruct-Turbo/xlam_single_tool_multiple_call.parquet ADDED
Binary file (30.6 kB). View file
 
output/Meta-Llama-3.1-8B-Instruct-Turbo/xlam_single_tool_single_call.parquet ADDED
Binary file (45 kB). View file
 
output/Meta-Llama-3.1-8B-Instruct-Turbo/xlam_tool_miss.parquet ADDED
Binary file (75.5 kB). View file
 
output/claude-3-5-haiku-20241022/BFCL_v3_irrelevance.parquet ADDED
Binary file (56.5 kB). View file
 
output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_base_multi_func_call.parquet ADDED
Binary file (25.8 kB). View file
 
output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_base_single_func_call.parquet ADDED
Binary file (24.7 kB). View file
 
output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_composite.parquet ADDED
Binary file (50.6 kB). View file
 
output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_long_context.parquet ADDED
Binary file (40.6 kB). View file
 
output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_miss_func.parquet ADDED
Binary file (49 kB). View file
 
output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_miss_param.parquet ADDED
Binary file (49.8 kB). View file
 
output/claude-3-5-haiku-20241022/tau_long_context.parquet ADDED
Binary file (42.4 kB). View file
 
output/claude-3-5-haiku-20241022/toolace_single_func_call_1.parquet ADDED
Binary file (19.9 kB). View file
 
output/claude-3-5-haiku-20241022/toolace_single_func_call_2.parquet ADDED
Binary file (13.8 kB). View file
 
output/claude-3-5-haiku-20241022/xlam_multiple_tool_multiple_call.parquet ADDED
Binary file (89.3 kB). View file
 
output/claude-3-5-haiku-20241022/xlam_multiple_tool_single_call.parquet ADDED
Binary file (40.8 kB). View file
 
output/claude-3-5-haiku-20241022/xlam_single_tool_multiple_call.parquet ADDED
Binary file (27.3 kB). View file
 
output/claude-3-5-haiku-20241022/xlam_single_tool_single_call.parquet ADDED
Binary file (49.3 kB). View file
 
output/claude-3-5-haiku-20241022/xlam_tool_miss.parquet ADDED
Binary file (56.6 kB). View file
 
output/claude-3-5-sonnet-20241022/BFCL_v3_irrelevance.parquet ADDED
Binary file (47.4 kB). View file