Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Pratik Bhavsar
committed on
Commit
·
b0ce6f5
1
Parent(s):
df66c39
added data exploration
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- app.py +7 -8
- chat.py +266 -115
- data_loader.py +19 -5
- get_exp_data.ipynb +167 -0
- output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_irrelevance.parquet +0 -0
- output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_base_multi_func_call.parquet +0 -0
- output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_base_single_func_call.parquet +0 -0
- output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_composite.parquet +0 -0
- output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_long_context.parquet +0 -0
- output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_miss_func.parquet +0 -0
- output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_miss_param.parquet +0 -0
- output/Llama-3.3-70B-Instruct-Turbo/tau_long_context.parquet +0 -0
- output/Llama-3.3-70B-Instruct-Turbo/toolace_single_func_call_1.parquet +0 -0
- output/Llama-3.3-70B-Instruct-Turbo/toolace_single_func_call_2.parquet +0 -0
- output/Llama-3.3-70B-Instruct-Turbo/xlam_multiple_tool_multiple_call.parquet +0 -0
- output/Llama-3.3-70B-Instruct-Turbo/xlam_multiple_tool_single_call.parquet +0 -0
- output/Llama-3.3-70B-Instruct-Turbo/xlam_single_tool_multiple_call.parquet +0 -0
- output/Llama-3.3-70B-Instruct-Turbo/xlam_single_tool_single_call.parquet +0 -0
- output/Llama-3.3-70B-Instruct-Turbo/xlam_tool_miss.parquet +0 -0
- output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_irrelevance.parquet +0 -0
- output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_base_multi_func_call.parquet +0 -0
- output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_base_single_func_call.parquet +0 -0
- output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_composite.parquet +0 -0
- output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_long_context.parquet +0 -0
- output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_miss_func.parquet +0 -0
- output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_miss_param.parquet +0 -0
- output/Meta-Llama-3.1-8B-Instruct-Turbo/tau_long_context.parquet +0 -0
- output/Meta-Llama-3.1-8B-Instruct-Turbo/toolace_single_func_call_1.parquet +0 -0
- output/Meta-Llama-3.1-8B-Instruct-Turbo/toolace_single_func_call_2.parquet +0 -0
- output/Meta-Llama-3.1-8B-Instruct-Turbo/xlam_multiple_tool_multiple_call.parquet +0 -0
- output/Meta-Llama-3.1-8B-Instruct-Turbo/xlam_multiple_tool_single_call.parquet +0 -0
- output/Meta-Llama-3.1-8B-Instruct-Turbo/xlam_single_tool_multiple_call.parquet +0 -0
- output/Meta-Llama-3.1-8B-Instruct-Turbo/xlam_single_tool_single_call.parquet +0 -0
- output/Meta-Llama-3.1-8B-Instruct-Turbo/xlam_tool_miss.parquet +0 -0
- output/claude-3-5-haiku-20241022/BFCL_v3_irrelevance.parquet +0 -0
- output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_base_multi_func_call.parquet +0 -0
- output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_base_single_func_call.parquet +0 -0
- output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_composite.parquet +0 -0
- output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_long_context.parquet +0 -0
- output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_miss_func.parquet +0 -0
- output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_miss_param.parquet +0 -0
- output/claude-3-5-haiku-20241022/tau_long_context.parquet +0 -0
- output/claude-3-5-haiku-20241022/toolace_single_func_call_1.parquet +0 -0
- output/claude-3-5-haiku-20241022/toolace_single_func_call_2.parquet +0 -0
- output/claude-3-5-haiku-20241022/xlam_multiple_tool_multiple_call.parquet +0 -0
- output/claude-3-5-haiku-20241022/xlam_multiple_tool_single_call.parquet +0 -0
- output/claude-3-5-haiku-20241022/xlam_single_tool_multiple_call.parquet +0 -0
- output/claude-3-5-haiku-20241022/xlam_single_tool_single_call.parquet +0 -0
- output/claude-3-5-haiku-20241022/xlam_tool_miss.parquet +0 -0
- output/claude-3-5-sonnet-20241022/BFCL_v3_irrelevance.parquet +0 -0
app.py
CHANGED
@@ -3,7 +3,6 @@ import promptquality as pq
|
|
3 |
from dotenv import load_dotenv
|
4 |
|
5 |
load_dotenv()
|
6 |
-
pq.login("https://console.demo.rungalileo.io")
|
7 |
|
8 |
from data_loader import (
|
9 |
load_data,
|
@@ -36,9 +35,9 @@ def create_app():
|
|
36 |
|
37 |
mc_info, mc_plot = create_model_comparison_tab(df, HEADER_CONTENT)
|
38 |
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
|
43 |
# Initial loads
|
44 |
app.load(
|
@@ -55,10 +54,10 @@ def create_app():
|
|
55 |
outputs=[mc_info, mc_plot],
|
56 |
)
|
57 |
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
|
63 |
return app
|
64 |
|
|
|
3 |
from dotenv import load_dotenv
|
4 |
|
5 |
load_dotenv()
|
|
|
6 |
|
7 |
from data_loader import (
|
8 |
load_data,
|
|
|
35 |
|
36 |
mc_info, mc_plot = create_model_comparison_tab(df, HEADER_CONTENT)
|
37 |
|
38 |
+
exp_outputs = create_exploration_tab(
|
39 |
+
df, MODELS, DATASETS, SCORES, HEADER_CONTENT
|
40 |
+
)
|
41 |
|
42 |
# Initial loads
|
43 |
app.load(
|
|
|
54 |
outputs=[mc_info, mc_plot],
|
55 |
)
|
56 |
|
57 |
+
app.load(
|
58 |
+
fn=lambda: filter_and_update_display(MODELS[0], DATASETS[0], 0, 1, 0),
|
59 |
+
outputs=exp_outputs,
|
60 |
+
)
|
61 |
|
62 |
return app
|
63 |
|
chat.py
CHANGED
@@ -1,199 +1,350 @@
|
|
1 |
-
# chat.py
|
2 |
import gradio as gr
|
3 |
-
import json
|
4 |
import pandas as pd
|
5 |
-
import
|
6 |
-
from functools import lru_cache
|
7 |
-
import promptquality as pq
|
8 |
-
|
9 |
-
project_name = "agent-lb-v1"
|
10 |
-
PROJECT_ID = pq.get_project_from_name(project_name).id
|
11 |
-
|
12 |
-
|
13 |
-
@lru_cache(maxsize=1000)
|
14 |
-
def get_model_score_for_dataset(model, dataset):
|
15 |
-
print(f"Getting metrics for {model} {project_name} for dataset {dataset}")
|
16 |
-
run_name = f"{model} {dataset}"
|
17 |
-
run_id = pq.get_run_from_name(run_name, PROJECT_ID).id
|
18 |
-
rows = pq.get_rows(
|
19 |
-
project_id=PROJECT_ID,
|
20 |
-
run_id=run_id,
|
21 |
-
task_type=None,
|
22 |
-
config=None,
|
23 |
-
starting_token=0,
|
24 |
-
limit=1000,
|
25 |
-
)
|
26 |
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
]
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
]
|
38 |
-
|
39 |
-
mean_score = round(np.mean(scores), 2)
|
40 |
-
return {
|
41 |
-
"mean_score": mean_score,
|
42 |
-
"scores": scores,
|
43 |
-
"rationales": rationales,
|
44 |
-
"explanations": explanations,
|
45 |
-
}
|
46 |
-
|
47 |
-
|
48 |
-
def get_updated_df(df, data):
|
49 |
-
df["rationale"] = data["rationales"]
|
50 |
-
df["explanation"] = data["explanations"]
|
51 |
-
df["score"] = data["scores"]
|
52 |
-
return df
|
53 |
|
54 |
|
55 |
def get_chat_and_score_df(model, dataset):
|
56 |
-
|
57 |
df = pd.read_parquet(f"datasets/{dataset}.parquet")
|
58 |
-
df = get_updated_df(df,
|
59 |
return df
|
60 |
|
61 |
|
62 |
-
def format_chat_message(role, content):
|
63 |
-
"""Format individual chat messages with
|
64 |
role_style = role.lower()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
return f"""
|
66 |
-
<div
|
67 |
-
|
68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
</div>
|
70 |
"""
|
71 |
|
72 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
def format_tool_info(tools):
|
74 |
-
"""Format tool information with
|
75 |
if isinstance(tools, str):
|
76 |
try:
|
77 |
tools = json.loads(tools)
|
78 |
except:
|
79 |
-
return
|
80 |
|
81 |
if not tools:
|
82 |
-
return
|
83 |
|
84 |
tool_html = ""
|
85 |
for tool in tools:
|
|
|
86 |
tool_html += f"""
|
87 |
-
<div
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
</div>
|
93 |
</div>
|
94 |
"""
|
95 |
-
return f
|
96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
97 |
|
98 |
def format_parameters(parameters):
|
99 |
if not parameters:
|
100 |
-
return
|
101 |
|
102 |
params_html = ""
|
103 |
for name, desc in parameters.items():
|
104 |
params_html += f"""
|
105 |
-
<div
|
106 |
-
<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
</div>
|
108 |
"""
|
109 |
return params_html
|
110 |
|
111 |
-
|
112 |
def format_metrics(score, rationale, explanation):
|
113 |
-
"""Format metrics display with
|
|
|
|
|
|
|
|
|
|
|
114 |
return f"""
|
115 |
-
<div
|
116 |
-
|
117 |
-
|
118 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
</div>
|
120 |
-
<div
|
121 |
-
<h3
|
122 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
123 |
</div>
|
124 |
-
<div
|
125 |
-
<h3
|
126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
</div>
|
128 |
</div>
|
129 |
"""
|
130 |
|
131 |
-
|
132 |
def update_chat_display(df, index):
|
133 |
-
"""Update the chat visualization
|
134 |
if df is None or df.empty or index >= len(df):
|
135 |
return (
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
)
|
140 |
|
141 |
row = df.iloc[index]
|
142 |
|
143 |
-
# Format chat messages
|
144 |
messages = json.loads(row["conversation"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
145 |
chat_html = f"""
|
146 |
-
<div
|
147 |
-
|
148 |
-
|
|
|
|
|
|
|
|
|
149 |
</div>
|
150 |
"""
|
151 |
|
152 |
-
# Format metrics
|
153 |
metrics_html = format_metrics(row["score"], row["rationale"], row["explanation"])
|
154 |
-
|
155 |
-
# Format tool info
|
156 |
tool_html = format_tool_info(row["tools_langchain"])
|
157 |
|
158 |
return chat_html, metrics_html, tool_html
|
159 |
|
160 |
|
161 |
-
def filter_and_update_display(model, dataset,
|
162 |
try:
|
163 |
-
# Get data and filter by scores
|
164 |
df_chat = get_chat_and_score_df(model, dataset)
|
165 |
-
|
166 |
-
df_chat
|
|
|
167 |
|
168 |
if df_chat.empty:
|
169 |
return (
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
gr.update(maximum=0, value=0),
|
174 |
"0/0",
|
175 |
)
|
176 |
|
177 |
-
# Update index bounds
|
178 |
max_index = len(df_chat) - 1
|
179 |
current_index = min(current_index, max_index)
|
180 |
-
|
181 |
-
# Get displays for current index
|
182 |
chat_html, metrics_html, tool_html = update_chat_display(df_chat, current_index)
|
183 |
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
tool_html,
|
188 |
-
gr.update(maximum=max_index, value=current_index),
|
189 |
-
f"{current_index + 1}/{len(df_chat)}",
|
190 |
-
)
|
191 |
except Exception as e:
|
192 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
193 |
return (
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
gr.update(maximum=0, value=0),
|
198 |
"0/0",
|
199 |
)
|
|
|
|
|
1 |
import gradio as gr
|
|
|
2 |
import pandas as pd
|
3 |
+
import json
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
|
5 |
+
|
6 |
+
def get_updated_df(df, df_output):
|
7 |
+
df = df.iloc[: len(df_output)].copy()
|
8 |
+
df["response"] = df_output["response"].tolist()
|
9 |
+
df["rationale"] = df_output["rationale"].tolist()
|
10 |
+
df["explanation"] = df_output["explanation"].tolist()
|
11 |
+
df["score"] = df_output["score"].tolist()
|
12 |
+
cols = [
|
13 |
+
"conversation",
|
14 |
+
"tools_langchain",
|
15 |
+
"n_turns",
|
16 |
+
"len_query",
|
17 |
+
"n_tools",
|
18 |
+
"response",
|
19 |
+
"rationale",
|
20 |
+
"explanation",
|
21 |
+
"score",
|
22 |
]
|
23 |
+
return df[cols]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
|
26 |
def get_chat_and_score_df(model, dataset):
|
27 |
+
df_output = pd.read_parquet(f"output/{model}/{dataset}.parquet")
|
28 |
df = pd.read_parquet(f"datasets/{dataset}.parquet")
|
29 |
+
df = get_updated_df(df, df_output)
|
30 |
return df
|
31 |
|
32 |
|
33 |
+
def format_chat_message(role, content, is_response=False):
|
34 |
+
"""Format individual chat messages with alignment based on role."""
|
35 |
role_style = role.lower()
|
36 |
+
alignment = "flex-end" if role_style == "user" else "flex-start"
|
37 |
+
max_width = "80%"
|
38 |
+
|
39 |
+
# Clean up any excessive whitespace while preserving intentional line breaks
|
40 |
+
cleaned_content = "\n".join(line.strip() for line in content.split("\n"))
|
41 |
+
|
42 |
+
background_color = (
|
43 |
+
"var(--response-bg)" if is_response else f"var(--message-bg-{role_style})"
|
44 |
+
)
|
45 |
+
|
46 |
return f"""
|
47 |
+
<div style="
|
48 |
+
display: flex;
|
49 |
+
justify-content: {alignment};
|
50 |
+
margin: 0.75rem 0;">
|
51 |
+
<div style="
|
52 |
+
max-width: {max_width};
|
53 |
+
padding: 1rem;
|
54 |
+
border-radius: 12px;
|
55 |
+
background-color: {background_color};
|
56 |
+
border: 1px solid var(--border-color);
|
57 |
+
box-shadow: 0 1px 2px var(--shadow-color);">
|
58 |
+
<div style="
|
59 |
+
font-weight: 600;
|
60 |
+
color: var(--primary-text);
|
61 |
+
margin-bottom: 0.5rem;
|
62 |
+
font-size: 0.9rem;
|
63 |
+
text-transform: uppercase;">
|
64 |
+
{role + (" Response" if is_response else "")}
|
65 |
+
</div>
|
66 |
+
<div style="
|
67 |
+
color: var(--text-color);
|
68 |
+
line-height: 1.6;
|
69 |
+
white-space: pre-wrap;
|
70 |
+
font-family: {is_response and 'monospace' or 'inherit'};
|
71 |
+
font-size: {is_response and '0.9rem' or 'inherit'};">
|
72 |
+
{cleaned_content}
|
73 |
+
</div>
|
74 |
+
</div>
|
75 |
</div>
|
76 |
"""
|
77 |
|
78 |
|
79 |
+
def format_response(response):
|
80 |
+
"""Format the response data, handling both JSON and text."""
|
81 |
+
try:
|
82 |
+
# Try to parse as JSON
|
83 |
+
response_data = json.loads(response)
|
84 |
+
# Format JSON response nicely
|
85 |
+
formatted_response = json.dumps(response_data, indent=2)
|
86 |
+
except (json.JSONDecodeError, TypeError):
|
87 |
+
# If not JSON, use as is
|
88 |
+
formatted_response = str(response)
|
89 |
+
|
90 |
+
return formatted_response
|
91 |
+
|
92 |
+
|
93 |
+
def parse_tool_schema(tool):
|
94 |
+
"""Parse tool schema to extract name, description, and parameters properly."""
|
95 |
+
name = tool.get("title", "Unnamed Tool")
|
96 |
+
description = tool.get("description", "No description available")
|
97 |
+
|
98 |
+
parameters = {}
|
99 |
+
if "properties" in tool:
|
100 |
+
for param_name, param_data in tool["properties"].items():
|
101 |
+
param_desc = param_data.get("description", "No description")
|
102 |
+
param_type = param_data.get("type", "unknown")
|
103 |
+
parameters[param_name] = f"{param_desc} (Type: {param_type})"
|
104 |
+
|
105 |
+
return name, description, parameters
|
106 |
+
|
107 |
+
|
108 |
def format_tool_info(tools):
|
109 |
+
"""Format tool information with improved schema parsing and dark theme support."""
|
110 |
if isinstance(tools, str):
|
111 |
try:
|
112 |
tools = json.loads(tools)
|
113 |
except:
|
114 |
+
return '<div style="padding: 1rem; color: var(--text-muted);">No tool information available</div>'
|
115 |
|
116 |
if not tools:
|
117 |
+
return '<div style="padding: 1rem; color: var(--text-muted);">No tool information available</div>'
|
118 |
|
119 |
tool_html = ""
|
120 |
for tool in tools:
|
121 |
+
name, description, parameters = parse_tool_schema(tool)
|
122 |
tool_html += f"""
|
123 |
+
<div style="
|
124 |
+
margin: 1rem 0;
|
125 |
+
padding: 1.5rem;
|
126 |
+
border-radius: 8px;
|
127 |
+
background-color: var(--surface-color);
|
128 |
+
border: 1px solid var(--border-color);">
|
129 |
+
<div style="
|
130 |
+
font-weight: 600;
|
131 |
+
color: var(--primary-text);
|
132 |
+
margin-bottom: 0.75rem;
|
133 |
+
font-size: 1.1rem;">
|
134 |
+
{name}
|
135 |
+
</div>
|
136 |
+
<div style="
|
137 |
+
color: var(--text-color);
|
138 |
+
margin-bottom: 1rem;
|
139 |
+
line-height: 1.5;">
|
140 |
+
{description}
|
141 |
+
</div>
|
142 |
+
<div style="
|
143 |
+
background-color: var(--surface-color-alt);
|
144 |
+
padding: 1rem;
|
145 |
+
border-radius: 4px;
|
146 |
+
border: 1px solid var(--border-color);">
|
147 |
+
{format_parameters(parameters)}
|
148 |
</div>
|
149 |
</div>
|
150 |
"""
|
151 |
+
return f"""
|
152 |
+
<div style="
|
153 |
+
max-height: 600px;
|
154 |
+
overflow-y: auto;
|
155 |
+
padding-right: 0.5rem;">
|
156 |
+
<style>
|
157 |
+
:root[data-theme="light"] {{
|
158 |
+
--surface-color: #f8f9fa;
|
159 |
+
--surface-color-alt: #ffffff;
|
160 |
+
--text-color: #202124;
|
161 |
+
--text-muted: #666666;
|
162 |
+
--primary-text: #1a73e8;
|
163 |
+
--border-color: #e9ecef;
|
164 |
+
--shadow-color: rgba(0,0,0,0.1);
|
165 |
+
--message-bg-user: #E5F6FD;
|
166 |
+
--message-bg-assistant: #F7F7F8;
|
167 |
+
--message-bg-system: #FFF3E0;
|
168 |
+
--score-high: #1a73e8;
|
169 |
+
--score-med: #f4b400;
|
170 |
+
--score-low: #ea4335;
|
171 |
+
}}
|
172 |
+
|
173 |
+
:root[data-theme="dark"] {{
|
174 |
+
--surface-color: #1e1e1e;
|
175 |
+
--surface-color-alt: #2d2d2d;
|
176 |
+
--text-color: #ffffff;
|
177 |
+
--text-muted: #a0a0a0;
|
178 |
+
--primary-text: #60a5fa;
|
179 |
+
--border-color: #404040;
|
180 |
+
--shadow-color: rgba(0,0,0,0.3);
|
181 |
+
--message-bg-user: #2d3748;
|
182 |
+
--message-bg-assistant: #1a1a1a;
|
183 |
+
--message-bg-system: #2c2516;
|
184 |
+
--response-bg: #2a2f3a;
|
185 |
+
--score-high: #60a5fa;
|
186 |
+
--score-med: #fbbf24;
|
187 |
+
--score-low: #ef4444;
|
188 |
+
}}
|
189 |
+
</style>
|
190 |
+
{tool_html}
|
191 |
+
</div>
|
192 |
+
"""
|
193 |
|
194 |
def format_parameters(parameters):
|
195 |
if not parameters:
|
196 |
+
return '<div style="color: var(--text-muted);">No parameters</div>'
|
197 |
|
198 |
params_html = ""
|
199 |
for name, desc in parameters.items():
|
200 |
params_html += f"""
|
201 |
+
<div style="margin: 0.75rem 0;">
|
202 |
+
<div style="
|
203 |
+
font-weight: 500;
|
204 |
+
color: var(--primary-text);
|
205 |
+
margin-bottom: 0.25rem;">
|
206 |
+
{name}
|
207 |
+
</div>
|
208 |
+
<div style="
|
209 |
+
color: var(--text-color);
|
210 |
+
line-height: 1.4;
|
211 |
+
font-size: 0.95rem;">
|
212 |
+
{desc}
|
213 |
+
</div>
|
214 |
</div>
|
215 |
"""
|
216 |
return params_html
|
217 |
|
|
|
218 |
def format_metrics(score, rationale, explanation):
|
219 |
+
"""Format metrics display with improved dark theme support."""
|
220 |
+
score_color = (
|
221 |
+
"var(--score-high)"
|
222 |
+
if score >= 0.7
|
223 |
+
else "var(--score-med)" if score >= 0.4 else "var(--score-low)"
|
224 |
+
)
|
225 |
return f"""
|
226 |
+
<div style="
|
227 |
+
padding: 1.5rem;
|
228 |
+
background-color: var(--surface-color);
|
229 |
+
border-radius: 8px;
|
230 |
+
border: 1px solid var(--border-color);
|
231 |
+
box-shadow: 0 2px 4px var(--shadow-color);">
|
232 |
+
<div style="margin-bottom: 1.5rem;">
|
233 |
+
<h3 style="
|
234 |
+
color: var(--text-color);
|
235 |
+
font-size: 1.1rem;
|
236 |
+
margin-bottom: 0.5rem;
|
237 |
+
font-weight: 600;">TSQ Score</h3>
|
238 |
+
<div style="
|
239 |
+
font-size: 2rem;
|
240 |
+
font-weight: 600;
|
241 |
+
color: {score_color};">
|
242 |
+
{score:.2f}
|
243 |
+
</div>
|
244 |
</div>
|
245 |
+
<div style="margin-bottom: 1.5rem;">
|
246 |
+
<h3 style="
|
247 |
+
color: var(--text-color);
|
248 |
+
font-size: 1.1rem;
|
249 |
+
margin-bottom: 0.5rem;
|
250 |
+
font-weight: 600;">Rationale</h3>
|
251 |
+
<div style="
|
252 |
+
color: var(--text-color);
|
253 |
+
line-height: 1.5;">
|
254 |
+
{rationale}
|
255 |
+
</div>
|
256 |
</div>
|
257 |
+
<div>
|
258 |
+
<h3 style="
|
259 |
+
color: var(--text-color);
|
260 |
+
font-size: 1.1rem;
|
261 |
+
margin-bottom: 0.5rem;
|
262 |
+
font-weight: 600;">Explanation</h3>
|
263 |
+
<div style="
|
264 |
+
color: var(--text-color);
|
265 |
+
line-height: 1.5;">
|
266 |
+
{explanation}
|
267 |
+
</div>
|
268 |
</div>
|
269 |
</div>
|
270 |
"""
|
271 |
|
|
|
272 |
def update_chat_display(df, index):
|
273 |
+
"""Update the chat visualization with improved dark theme support."""
|
274 |
if df is None or df.empty or index >= len(df):
|
275 |
return (
|
276 |
+
'<div style="padding: 1rem; color: var(--text-muted);">No data available</div>',
|
277 |
+
'<div style="padding: 1rem; color: var(--text-muted);">No metrics available</div>',
|
278 |
+
'<div style="padding: 1rem; color: var(--text-muted);">No tool information available</div>',
|
279 |
)
|
280 |
|
281 |
row = df.iloc[index]
|
282 |
|
|
|
283 |
messages = json.loads(row["conversation"])
|
284 |
+
response = row["response"]
|
285 |
+
formatted_response = format_response(response)
|
286 |
+
|
287 |
+
# Create list of all messages including the response
|
288 |
+
all_messages = [
|
289 |
+
format_chat_message(msg["role"], msg["content"]) for msg in messages
|
290 |
+
]
|
291 |
+
all_messages.append(
|
292 |
+
format_chat_message("Assistant", formatted_response, is_response=True)
|
293 |
+
)
|
294 |
+
|
295 |
chat_html = f"""
|
296 |
+
<div style="
|
297 |
+
background-color: var(--surface-color);
|
298 |
+
border-radius: 8px;
|
299 |
+
border: 1px solid var(--border-color);
|
300 |
+
box-shadow: 0 2px 4px var(--shadow-color);
|
301 |
+
padding: 1.5rem;">
|
302 |
+
{"".join(all_messages)}
|
303 |
</div>
|
304 |
"""
|
305 |
|
|
|
306 |
metrics_html = format_metrics(row["score"], row["rationale"], row["explanation"])
|
|
|
|
|
307 |
tool_html = format_tool_info(row["tools_langchain"])
|
308 |
|
309 |
return chat_html, metrics_html, tool_html
|
310 |
|
311 |
|
312 |
+
def filter_and_update_display(model, dataset, min_score, max_score, current_index):
|
313 |
try:
|
|
|
314 |
df_chat = get_chat_and_score_df(model, dataset)
|
315 |
+
df_chat = df_chat[
|
316 |
+
(df_chat["score"] >= min_score) & (df_chat["score"] <= max_score)
|
317 |
+
]
|
318 |
|
319 |
if df_chat.empty:
|
320 |
return (
|
321 |
+
'<div style="padding: 1rem; color: var(--text-muted);">No data available for selected filters</div>',
|
322 |
+
'<div style="padding: 1rem; color: var(--text-muted);">No metrics available</div>',
|
323 |
+
'<div style="padding: 1rem; color: var(--text-muted);">No tool information available</div>',
|
|
|
324 |
"0/0",
|
325 |
)
|
326 |
|
|
|
327 |
max_index = len(df_chat) - 1
|
328 |
current_index = min(current_index, max_index)
|
|
|
|
|
329 |
chat_html, metrics_html, tool_html = update_chat_display(df_chat, current_index)
|
330 |
|
331 |
+
index_display = f'<div style="font-weight: 500; color: var(--primary-text);">{current_index + 1}/{len(df_chat)}</div>'
|
332 |
+
return chat_html, metrics_html, tool_html, index_display
|
333 |
+
|
|
|
|
|
|
|
|
|
334 |
except Exception as e:
|
335 |
+
error_html = f"""
|
336 |
+
<div style="
|
337 |
+
padding: 1rem;
|
338 |
+
color: var(--score-low);
|
339 |
+
background-color: var(--surface-color);
|
340 |
+
border: 1px solid var(--score-low);
|
341 |
+
border-radius: 4px;">
|
342 |
+
Error: {str(e)}
|
343 |
+
</div>
|
344 |
+
"""
|
345 |
return (
|
346 |
+
error_html,
|
347 |
+
'<div style="padding: 1rem; color: var(--text-muted);">No metrics available</div>',
|
348 |
+
'<div style="padding: 1rem; color: var(--text-muted);">No tool information available</div>',
|
|
|
349 |
"0/0",
|
350 |
)
|
data_loader.py
CHANGED
@@ -1,11 +1,25 @@
|
|
1 |
import pandas as pd
|
2 |
-
from glob import glob
|
3 |
-
import numpy as np
|
4 |
-
from pathlib import Path
|
5 |
|
6 |
|
7 |
-
DATASETS = [
|
8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
def load_data():
|
11 |
"""Load and preprocess the data."""
|
|
|
1 |
import pandas as pd
|
|
|
|
|
|
|
2 |
|
3 |
|
4 |
+
DATASETS = [
|
5 |
+
"BFCL_v3_irrelevance",
|
6 |
+
"BFCL_v3_multi_turn_base_multi_func_call",
|
7 |
+
"BFCL_v3_multi_turn_base_single_func_call",
|
8 |
+
"BFCL_v3_multi_turn_composite",
|
9 |
+
"BFCL_v3_multi_turn_long_context",
|
10 |
+
"BFCL_v3_multi_turn_miss_func",
|
11 |
+
"BFCL_v3_multi_turn_miss_param",
|
12 |
+
"tau_long_context",
|
13 |
+
"toolace_single_func_call_1",
|
14 |
+
"toolace_single_func_call_2",
|
15 |
+
"xlam_multiple_tool_multiple_call",
|
16 |
+
"xlam_multiple_tool_single_call",
|
17 |
+
"xlam_single_tool_multiple_call",
|
18 |
+
"xlam_single_tool_single_call",
|
19 |
+
"xlam_tool_miss",
|
20 |
+
]
|
21 |
+
|
22 |
+
SCORES = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
|
23 |
|
24 |
def load_data():
|
25 |
"""Load and preprocess the data."""
|
get_exp_data.ipynb
ADDED
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"import os\n",
|
10 |
+
"import pandas as pd\n",
|
11 |
+
"import numpy as np\n",
|
12 |
+
"from functools import lru_cache\n",
|
13 |
+
"from concurrent.futures import ThreadPoolExecutor\n",
|
14 |
+
"import promptquality as pq\n",
|
15 |
+
"from dotenv import load_dotenv\n",
|
16 |
+
"from data_loader import DATASETS, load_data\n",
|
17 |
+
"from tqdm.auto import tqdm\n",
|
18 |
+
"\n",
|
19 |
+
"load_dotenv()\n",
|
20 |
+
"pq.login(\"https://console.demo.rungalileo.io\")"
|
21 |
+
]
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"cell_type": "code",
|
25 |
+
"execution_count": 2,
|
26 |
+
"metadata": {},
|
27 |
+
"outputs": [],
|
28 |
+
"source": [
|
29 |
+
"project_name = \"agent-lb-v1\"\n",
|
30 |
+
"PROJECT_ID = pq.get_project_from_name(project_name).id\n",
|
31 |
+
"\n",
|
32 |
+
"\n",
|
33 |
+
"@lru_cache(maxsize=1000)\n",
|
34 |
+
"def get_output_df(model, dataset):\n",
|
35 |
+
" print(f\"Getting metrics for {model} {project_name} for dataset {dataset}\")\n",
|
36 |
+
" run_name = f\"{model} {dataset}\"\n",
|
37 |
+
" run_id = pq.get_run_from_name(run_name, PROJECT_ID).id\n",
|
38 |
+
" rows = pq.get_rows(\n",
|
39 |
+
" project_id=PROJECT_ID,\n",
|
40 |
+
" run_id=run_id,\n",
|
41 |
+
" task_type=None,\n",
|
42 |
+
" config=None,\n",
|
43 |
+
" starting_token=0,\n",
|
44 |
+
" limit=1000,\n",
|
45 |
+
" )\n",
|
46 |
+
"\n",
|
47 |
+
" rationales = [d.metrics.tool_selection_quality_rationale for d in rows]\n",
|
48 |
+
"\n",
|
49 |
+
" scores = [\n",
|
50 |
+
" round(d.metrics.tool_selection_quality, 2)\n",
|
51 |
+
" for d, rationale in zip(rows, rationales)\n",
|
52 |
+
" if rationale\n",
|
53 |
+
" ]\n",
|
54 |
+
" \n",
|
55 |
+
" explanations = [\n",
|
56 |
+
" d.metrics.tool_selection_quality_explanation\n",
|
57 |
+
" for d, rationale in zip(rows, rationales)\n",
|
58 |
+
" if rationale\n",
|
59 |
+
" ]\n",
|
60 |
+
" \n",
|
61 |
+
" responses = [d.response for d, rationale in zip(rows, rationales)\n",
|
62 |
+
" if rationale\n",
|
63 |
+
" ]\n",
|
64 |
+
" \n",
|
65 |
+
" rationales = [r for r in rationales if r]\n",
|
66 |
+
" mean_score = round(np.mean(scores), 2)\n",
|
67 |
+
" \n",
|
68 |
+
" data = {\n",
|
69 |
+
" \"response\": responses,\n",
|
70 |
+
" \"mean_score\": mean_score,\n",
|
71 |
+
" \"score\": scores,\n",
|
72 |
+
" \"rationale\": rationales,\n",
|
73 |
+
" \"explanation\": explanations,\n",
|
74 |
+
" }\n",
|
75 |
+
" return pd.DataFrame(data)\n",
|
76 |
+
"\n",
|
77 |
+
"def save_output_df(df, model, dataset):\n",
|
78 |
+
" os.makedirs(f\"output/{model}\", exist_ok=True)\n",
|
79 |
+
" df.to_parquet(f\"output/{model}/{dataset}.parquet\")\n",
|
80 |
+
"\n",
|
81 |
+
"def get_updated_df(df, df_output):\n",
|
82 |
+
" df = df.iloc[:len(df_output)].copy()\n",
|
83 |
+
" \n",
|
84 |
+
" df[\"response\"] = df_output[\"response\"].tolist()\n",
|
85 |
+
" df[\"rationale\"] = df_output[\"rationale\"].tolist()\n",
|
86 |
+
" df[\"explanation\"] = df_output[\"explanation\"].tolist()\n",
|
87 |
+
" df[\"score\"] = df_output[\"score\"].tolist()\n",
|
88 |
+
" cols = ['conversation', 'tools_langchain', 'n_turns',\n",
|
89 |
+
" 'len_query', 'n_tools', 'response', 'rationale', 'explanation', 'score']\n",
|
90 |
+
" return df[cols]\n",
|
91 |
+
"\n",
|
92 |
+
"\n",
|
93 |
+
"def get_chat_and_score_df(model, dataset):\n",
|
94 |
+
" df_output = pd.read_parquet(f\"output/{model}/{dataset}.parquet\")\n",
|
95 |
+
" df = pd.read_parquet(f\"datasets/{dataset}.parquet\")\n",
|
96 |
+
" df = get_updated_df(df, df_output)\n",
|
97 |
+
" return df"
|
98 |
+
]
|
99 |
+
},
|
100 |
+
{
|
101 |
+
"cell_type": "code",
|
102 |
+
"execution_count": null,
|
103 |
+
"metadata": {},
|
104 |
+
"outputs": [],
|
105 |
+
"source": [
|
106 |
+
"def process_dataset(args):\n",
|
107 |
+
" model, dataset = args\n",
|
108 |
+
" if os.path.exists(f\"output/{model}/{dataset}.parquet\"):\n",
|
109 |
+
" return None\n",
|
110 |
+
" print(model, dataset)\n",
|
111 |
+
" df_output = get_output_df(model, dataset)\n",
|
112 |
+
" save_output_df(df_output, model, dataset)\n",
|
113 |
+
" return f\"Completed: {model} - {dataset}\"\n",
|
114 |
+
"\n",
|
115 |
+
"def process_model_datasets(model, datasets, max_workers=5):\n",
|
116 |
+
" with ThreadPoolExecutor(max_workers=max_workers) as executor:\n",
|
117 |
+
" # Create arguments list for each dataset\n",
|
118 |
+
" args_list = [(model, dataset) for dataset in datasets]\n",
|
119 |
+
" \n",
|
120 |
+
" # Process datasets in parallel with progress bar\n",
|
121 |
+
" list(tqdm(\n",
|
122 |
+
" executor.map(process_dataset, args_list),\n",
|
123 |
+
" total=len(datasets),\n",
|
124 |
+
" desc=f\"Datasets ({model})\",\n",
|
125 |
+
" position=1,\n",
|
126 |
+
" leave=False\n",
|
127 |
+
" ))\n",
|
128 |
+
"\n",
|
129 |
+
"\n",
|
130 |
+
"models = [\"accounts/fireworks/models/qwen2p5-72b-instruct\", \"meta-llama/Llama-3.3-70B-Instruct-Turbo\", \"meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo\"]\n",
|
131 |
+
"# models = load_data()[\"Model\"]\n",
|
132 |
+
"\n",
|
133 |
+
"# Process each model sequentially, but datasets in parallel\n",
|
134 |
+
"for model in tqdm(models, desc=\"Models\", position=0):\n",
|
135 |
+
" process_model_datasets(model, DATASETS)\n"
|
136 |
+
]
|
137 |
+
},
|
138 |
+
{
|
139 |
+
"cell_type": "code",
|
140 |
+
"execution_count": null,
|
141 |
+
"metadata": {},
|
142 |
+
"outputs": [],
|
143 |
+
"source": []
|
144 |
+
}
|
145 |
+
],
|
146 |
+
"metadata": {
|
147 |
+
"kernelspec": {
|
148 |
+
"display_name": "langgraph",
|
149 |
+
"language": "python",
|
150 |
+
"name": "python3"
|
151 |
+
},
|
152 |
+
"language_info": {
|
153 |
+
"codemirror_mode": {
|
154 |
+
"name": "ipython",
|
155 |
+
"version": 3
|
156 |
+
},
|
157 |
+
"file_extension": ".py",
|
158 |
+
"mimetype": "text/x-python",
|
159 |
+
"name": "python",
|
160 |
+
"nbconvert_exporter": "python",
|
161 |
+
"pygments_lexer": "ipython3",
|
162 |
+
"version": "3.12.6"
|
163 |
+
}
|
164 |
+
},
|
165 |
+
"nbformat": 4,
|
166 |
+
"nbformat_minor": 2
|
167 |
+
}
|
output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_irrelevance.parquet
ADDED
Binary file (36.4 kB). View file
|
|
output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_base_multi_func_call.parquet
ADDED
Binary file (25.4 kB). View file
|
|
output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_base_single_func_call.parquet
ADDED
Binary file (22.9 kB). View file
|
|
output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_composite.parquet
ADDED
Binary file (42.4 kB). View file
|
|
output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_long_context.parquet
ADDED
Binary file (38 kB). View file
|
|
output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_miss_func.parquet
ADDED
Binary file (41.6 kB). View file
|
|
output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_miss_param.parquet
ADDED
Binary file (42.7 kB). View file
|
|
output/Llama-3.3-70B-Instruct-Turbo/tau_long_context.parquet
ADDED
Binary file (47.1 kB). View file
|
|
output/Llama-3.3-70B-Instruct-Turbo/toolace_single_func_call_1.parquet
ADDED
Binary file (13.1 kB). View file
|
|
output/Llama-3.3-70B-Instruct-Turbo/toolace_single_func_call_2.parquet
ADDED
Binary file (11.5 kB). View file
|
|
output/Llama-3.3-70B-Instruct-Turbo/xlam_multiple_tool_multiple_call.parquet
ADDED
Binary file (104 kB). View file
|
|
output/Llama-3.3-70B-Instruct-Turbo/xlam_multiple_tool_single_call.parquet
ADDED
Binary file (39.3 kB). View file
|
|
output/Llama-3.3-70B-Instruct-Turbo/xlam_single_tool_multiple_call.parquet
ADDED
Binary file (30.4 kB). View file
|
|
output/Llama-3.3-70B-Instruct-Turbo/xlam_single_tool_single_call.parquet
ADDED
Binary file (43.8 kB). View file
|
|
output/Llama-3.3-70B-Instruct-Turbo/xlam_tool_miss.parquet
ADDED
Binary file (49.4 kB). View file
|
|
output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_irrelevance.parquet
ADDED
Binary file (41.5 kB). View file
|
|
output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_base_multi_func_call.parquet
ADDED
Binary file (28.8 kB). View file
|
|
output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_base_single_func_call.parquet
ADDED
Binary file (24.3 kB). View file
|
|
output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_composite.parquet
ADDED
Binary file (59 kB). View file
|
|
output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_long_context.parquet
ADDED
Binary file (45.7 kB). View file
|
|
output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_miss_func.parquet
ADDED
Binary file (49.5 kB). View file
|
|
output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_miss_param.parquet
ADDED
Binary file (45.8 kB). View file
|
|
output/Meta-Llama-3.1-8B-Instruct-Turbo/tau_long_context.parquet
ADDED
Binary file (106 kB). View file
|
|
output/Meta-Llama-3.1-8B-Instruct-Turbo/toolace_single_func_call_1.parquet
ADDED
Binary file (18.3 kB). View file
|
|
output/Meta-Llama-3.1-8B-Instruct-Turbo/toolace_single_func_call_2.parquet
ADDED
Binary file (14.7 kB). View file
|
|
output/Meta-Llama-3.1-8B-Instruct-Turbo/xlam_multiple_tool_multiple_call.parquet
ADDED
Binary file (103 kB). View file
|
|
output/Meta-Llama-3.1-8B-Instruct-Turbo/xlam_multiple_tool_single_call.parquet
ADDED
Binary file (39.9 kB). View file
|
|
output/Meta-Llama-3.1-8B-Instruct-Turbo/xlam_single_tool_multiple_call.parquet
ADDED
Binary file (30.6 kB). View file
|
|
output/Meta-Llama-3.1-8B-Instruct-Turbo/xlam_single_tool_single_call.parquet
ADDED
Binary file (45 kB). View file
|
|
output/Meta-Llama-3.1-8B-Instruct-Turbo/xlam_tool_miss.parquet
ADDED
Binary file (75.5 kB). View file
|
|
output/claude-3-5-haiku-20241022/BFCL_v3_irrelevance.parquet
ADDED
Binary file (56.5 kB). View file
|
|
output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_base_multi_func_call.parquet
ADDED
Binary file (25.8 kB). View file
|
|
output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_base_single_func_call.parquet
ADDED
Binary file (24.7 kB). View file
|
|
output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_composite.parquet
ADDED
Binary file (50.6 kB). View file
|
|
output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_long_context.parquet
ADDED
Binary file (40.6 kB). View file
|
|
output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_miss_func.parquet
ADDED
Binary file (49 kB). View file
|
|
output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_miss_param.parquet
ADDED
Binary file (49.8 kB). View file
|
|
output/claude-3-5-haiku-20241022/tau_long_context.parquet
ADDED
Binary file (42.4 kB). View file
|
|
output/claude-3-5-haiku-20241022/toolace_single_func_call_1.parquet
ADDED
Binary file (19.9 kB). View file
|
|
output/claude-3-5-haiku-20241022/toolace_single_func_call_2.parquet
ADDED
Binary file (13.8 kB). View file
|
|
output/claude-3-5-haiku-20241022/xlam_multiple_tool_multiple_call.parquet
ADDED
Binary file (89.3 kB). View file
|
|
output/claude-3-5-haiku-20241022/xlam_multiple_tool_single_call.parquet
ADDED
Binary file (40.8 kB). View file
|
|
output/claude-3-5-haiku-20241022/xlam_single_tool_multiple_call.parquet
ADDED
Binary file (27.3 kB). View file
|
|
output/claude-3-5-haiku-20241022/xlam_single_tool_single_call.parquet
ADDED
Binary file (49.3 kB). View file
|
|
output/claude-3-5-haiku-20241022/xlam_tool_miss.parquet
ADDED
Binary file (56.6 kB). View file
|
|
output/claude-3-5-sonnet-20241022/BFCL_v3_irrelevance.parquet
ADDED
Binary file (47.4 kB). View file
|
|