import pandas as pd

DATASETS = [
    "BFCL_v3_irrelevance",
    "BFCL_v3_multi_turn_base_multi_func_call",
    "BFCL_v3_multi_turn_base_single_func_call",
    "BFCL_v3_multi_turn_composite",
    "BFCL_v3_multi_turn_long_context",
    "BFCL_v3_multi_turn_miss_func",
    "BFCL_v3_multi_turn_miss_param",
    "tau_long_context",
    "toolace_single_func_call_1",
    "toolace_single_func_call_2",
    "xlam_multiple_tool_multiple_call",
    "xlam_multiple_tool_single_call",
    "xlam_single_tool_multiple_call",
    "xlam_single_tool_single_call",
    "xlam_tool_miss",
]

SCORES = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]


def load_data():
    """Load and preprocess the data."""
    df = pd.read_csv("results.csv").dropna()
    # Add combined I/O cost column with 3:1 ratio
    df["IO Cost"] = (
        df["Input cost per million token"] * 0.75
        + df["Output cost per million token"] * 0.25
    )
    return df
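
# Worked example of the 3:1 blend above (assumed illustrative prices, not real vendor rates):
# an input price of $2.50/M tokens and an output price of $10.00/M tokens blend to
# 2.50 * 0.75 + 10.00 * 0.25 = 4.375 dollars per million tokens.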
# categories.py
CATEGORIES = {
    "Overall": ["Model Avg"],
    "Overall single turn": ["single turn perf"],
    "Overall multi turn": ["multi turn perf"],
    "Single func call": [
        "xlam_single_tool_single_call",
        "xlam_multiple_tool_single_call",
    ],
    "Multiple func call": [
        "xlam_multiple_tool_multiple_call",
        "xlam_single_tool_multiple_call",
        "BFCL_v3_multi_turn_base_multi_func_call",
    ],
    "Irrelevant query": ["BFCL_v3_irrelevance"],
    "Long context": ["tau_long_context", "BFCL_v3_multi_turn_long_context"],
    "Missing func": ["xlam_tool_miss", "BFCL_v3_multi_turn_miss_func"],
    "Missing params": ["BFCL_v3_multi_turn_miss_param"],
    "Composite": ["BFCL_v3_multi_turn_composite"],
}
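
# Illustrative helper (not part of the original app): derive per-category averages,
# assuming results.csv carries one score column per dataset named exactly as in
# DATASETS, so each CATEGORIES entry lists the columns to average.
def add_category_scores(df):
    """Add one column per category holding the mean of its member dataset columns."""
    out = df.copy()
    for category, columns in CATEGORIES.items():
        present = [col for col in columns if col in out.columns]
        if present:
            out[category] = out[present].mean(axis=1)
    return out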
chat_css = """ | |
/* Container styles */ | |
.container { | |
display: flex; | |
gap: 1.5rem; | |
height: calc(100vh - 100px); | |
padding: 1rem; | |
} | |
/* Chat panel styles */ | |
.chat-panel { | |
flex: 2; | |
background: #1a1f2c; | |
border-radius: 1rem; | |
padding: 1rem; | |
overflow-y: auto; | |
max-height: calc(100vh - 120px); | |
} | |
/* Message styles */ | |
.message { | |
padding: 1.2rem; | |
margin: 0.8rem; | |
border-radius: 1rem; | |
font-family: monospace; | |
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); | |
} | |
.system { | |
background: linear-gradient(135deg, #8e44ad, #9b59b6); | |
} | |
.user { | |
background: linear-gradient(135deg, #2c3e50, #3498db); | |
margin-left: 2rem; | |
} | |
.assistant { | |
background: linear-gradient(135deg, #27ae60, #2ecc71); | |
margin-right: 2rem; | |
} | |
.role-badge { | |
display: inline-block; | |
padding: 0.3rem 0.8rem; | |
border-radius: 0.5rem; | |
font-weight: bold; | |
margin-bottom: 0.8rem; | |
font-size: 0.9rem; | |
text-transform: uppercase; | |
letter-spacing: 0.05em; | |
} | |
.system-role { | |
background-color: #8e44ad; | |
color: white; | |
} | |
.user-role { | |
background-color: #3498db; | |
color: white; | |
} | |
.assistant-role { | |
background-color: #27ae60; | |
color: white; | |
} | |
.content { | |
white-space: pre-wrap; | |
word-break: break-word; | |
color: #f5f6fa; | |
line-height: 1.5; | |
} | |
/* Metrics panel styles */ | |
.metrics-panel { | |
flex: 1; | |
display: flex; | |
flex-direction: column; | |
gap: 2rem; | |
padding: 1.5rem; | |
background: #1a1f2c; | |
border-radius: 1rem; | |
} | |
.metric-section { | |
background: #1E293B; | |
padding: 1.5rem; | |
border-radius: 1rem; | |
} | |
.score-section { | |
text-align: center; | |
} | |
.score-display { | |
font-size: 3rem; | |
font-weight: bold; | |
color: #4ADE80; | |
line-height: 1; | |
margin: 0.5rem 0; | |
} | |
.explanation-text { | |
color: #E2E8F0; | |
line-height: 1.6; | |
font-size: 0.95rem; | |
} | |
/* Tool info panel styles */ | |
.tool-info-panel { | |
background: #1a1f2c; | |
padding: 1.5rem; | |
border-radius: 1rem; | |
color: #f5f6fa; | |
} | |
.tool-section { | |
margin-bottom: 1.5rem; | |
} | |
.tool-name { | |
font-size: 1.2rem; | |
color: #4ADE80; | |
font-weight: bold; | |
margin-bottom: 0.5rem; | |
} | |
.tool-description { | |
color: #E2E8F0; | |
line-height: 1.6; | |
margin-bottom: 1rem; | |
} | |
.tool-parameters .parameter { | |
margin: 0.5rem 0; | |
padding: 0.5rem; | |
background: rgba(255, 255, 255, 0.05); | |
border-radius: 0.5rem; | |
} | |
.param-name { | |
color: #63B3ED; | |
font-weight: bold; | |
margin-right: 0.5rem; | |
} | |
.tool-examples .example { | |
margin: 0.5rem 0; | |
padding: 0.5rem; | |
background: rgba(255, 255, 255, 0.05); | |
border-radius: 0.5rem; | |
font-family: monospace; | |
} | |
/* Custom scrollbar */ | |
::-webkit-scrollbar { | |
width: 8px; | |
} | |
::-webkit-scrollbar-track { | |
background: rgba(255, 255, 255, 0.1); | |
border-radius: 4px; | |
} | |
::-webkit-scrollbar-thumb { | |
background: linear-gradient(45deg, #3498db, #2ecc71); | |
border-radius: 4px; | |
} | |
/* Title styles */ | |
.title { | |
color: #63B3ED; | |
font-size: 2rem; | |
font-weight: bold; | |
text-align: center; | |
margin-bottom: 1.5rem; | |
padding: 1rem; | |
} | |
/* Headers */ | |
h3 { | |
color: #63B3ED; | |
margin: 0 0 1rem 0; | |
font-size: 1.1rem; | |
font-weight: 500; | |
letter-spacing: 0.05em; | |
} | |
""" | |
COMMON = """ | |
<style> | |
@media (prefers-color-scheme: dark) { | |
:root { | |
--bg-primary: #0B0B19; | |
--bg-secondary: rgba(19, 19, 37, 0.4); | |
--bg-hover: rgba(30, 30, 45, 0.95); | |
--text-primary: #ffffff; | |
--text-secondary: #e2e8f0; | |
--text-tertiary: #e2e8f0; | |
--border-color: rgba(31, 41, 55, 0.5); | |
--border-hover: rgba(79, 70, 229, 0.4); | |
--card-bg: rgba(17, 17, 27, 0.4); | |
--accent-color: #ffffff; | |
--accent-bg: rgba(79, 70, 229, 0.1); | |
--blue-gradient: linear-gradient(45deg, #60A5FA, #3B82F6); | |
--purple-gradient: linear-gradient(45deg, #A78BFA, #8B5CF6); | |
--pink-gradient: linear-gradient(45deg, #F472B6, #EC4899); | |
--shadow-color: rgba(0, 0, 0, 0.2); | |
} | |
} | |
@media (prefers-color-scheme: light) { | |
:root { | |
--bg-primary: #ffffff; | |
--bg-secondary: rgba(243, 244, 246, 0.4); | |
--bg-hover: rgba(229, 231, 235, 0.95); | |
--text-primary: #1F2937; | |
--text-secondary: #4B5563; | |
--text-tertiary: #6B7280; | |
--border-color: rgba(209, 213, 219, 0.5); | |
--border-hover: rgba(79, 70, 229, 0.4); | |
--card-bg: rgba(249, 250, 251, 0.4); | |
--accent-color: #4F46E5; | |
--accent-bg: rgba(79, 70, 229, 0.1); | |
--blue-gradient: linear-gradient(45deg, #3B82F6, #2563EB); | |
--purple-gradient: linear-gradient(45deg, #8B5CF6, #EF43CD); | |
--pink-gradient: linear-gradient(45deg, #EC4899, #DB2777); | |
--shadow-color: rgba(0, 0, 0, 0.1); | |
} | |
} | |
</style> | |
""" | |
DESCRIPTION_HTML = """ | |
<div style=" | |
background: var(--bg-secondary, rgba(30, 30, 45, 0.95)); | |
border-radius: 12px; | |
padding: 24px; | |
margin: 16px 0; | |
"> | |
<div style=" | |
display: flex; | |
flex-direction: column; | |
gap: 16px; | |
"> | |
<div style=" | |
color: var(--text-primary); | |
font-size: 1.1rem; | |
font-weight: 500; | |
display: flex; | |
align-items: center; | |
gap: 8px; | |
"> | |
🎯 Purpose | |
<span style=" | |
background: linear-gradient(to right, var(--accent-blue), var(--accent-purple)); | |
color: white; | |
padding: 4px 12px; | |
border-radius: 100px; | |
font-size: 0.9rem; | |
">Latest Update: Feb 2025</span> | |
</div> | |
<p style=" | |
color: var(--text-secondary); | |
margin: 0; | |
line-height: 1.6; | |
"> | |
This comprehensive benchmark evaluates language models' ability to effectively utilize tools and functions in complex scenarios. | |
</p> | |
<div style=" | |
color: var(--text-primary); | |
font-size: 1.1rem; | |
font-weight: 500; | |
margin-top: 8px; | |
"> | |
🔍 What We Evaluate | |
</div> | |
<div style=" | |
display: grid; | |
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); | |
gap: 16px; | |
color: var(--text-secondary); | |
"> | |
<div style="display: flex; gap: 8px; align-items: center;"> | |
🔄 Single/Multi-turn Interactions | |
</div> | |
<div style="display: flex; gap: 8px; align-items: center;"> | |
🧩 Function Composition | |
</div> | |
<div style="display: flex; gap: 8px; align-items: center;"> | |
⚡ Error Handling | |
</div> | |
</div> | |
<div style=" | |
color: var(--text-primary); | |
font-size: 1.1rem; | |
font-weight: 500; | |
margin-top: 8px; | |
"> | |
📊 Key Results | |
</div> | |
<div style=" | |
display: grid; | |
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); | |
gap: 16px; | |
color: var(--text-secondary); | |
"> | |
<div style="display: flex; gap: 8px; align-items: center;"> | |
✅ Tool Selection Quality | |
</div> | |
<div style="display: flex; gap: 8px; align-items: center;"> | |
💰 Open Vs Closed Source | |
</div> | |
<div style="display: flex; gap: 8px; align-items: center;"> | |
⚖️ Overall Effectiveness | |
</div> | |
</div> | |
</div> | |
</div> | |
""" | |
HEADER_CONTENT = (
    COMMON
    + """
<style>
.header-wrapper {
background: var(--bg-primary);
padding: 4rem 2rem;
border-radius: 16px;
margin-bottom: 0;
transition: all 0.3s ease;
}
.header-content {
max-width: 72rem;
margin: 0 auto;
}
.title-section {
text-align: center;
margin-bottom: 4rem;
}
.title-gradient {
font-size: 5rem;
font-weight: 800;
line-height: 1.1;
background: var(--purple-gradient);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
margin-bottom: 0.5rem;
}
.subtitle-white {
font-size: 5rem;
font-weight: 800;
line-height: 1.1;
color: var(--text-primary);
margin-bottom: 3rem;
transition: color 0.3s ease;
}
.description {
color: var(--text-secondary);
font-size: 1.25rem;
line-height: 1.75;
max-width: 800px;
margin: 0 auto;
text-align: center;
transition: color 0.3s ease;
}
.highlight-question {
background: var(--blue-gradient);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
display: block;
margin-top: 1rem;
font-size: 1.5rem;
font-weight: 500;
}
.metrics-grid {
display: grid;
grid-template-columns: repeat(3, 1fr);
gap: 1.5rem;
margin-top: 4rem;
}
.metric-card {
background: var(--bg-secondary);
border: 1px solid var(--border-color);
border-radius: 1rem;
padding: 2rem;
transition: all 0.3s ease;
align-items: center;
}
.metric-card:hover {
transform: translateY(-5px);
border-color: var(--border-hover);
box-shadow: 0 4px 20px var(--shadow-color);
}
.metric-number {
font-size: 4rem;
font-weight: 800;
margin-bottom: 1rem;
}
.metric-blue {
background: var(--blue-gradient);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
}
.metric-purple {
background: var(--purple-gradient);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
}
.metric-pink {
background: var(--pink-gradient);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
}
.metric-label {
color: var(--text-secondary);
font-size: 1.5rem;
margin-bottom: 1.5rem;
transition: color 0.3s ease;
}
.metric-detail {
font-size: 1.125rem;
line-height: 1.75;
margin-top: 0.5rem;
transition: color 0.3s ease;
}
.metric-detail.primary {
color: var(--accent-color);
}
.metric-detail.secondary {
color: var(--text-secondary);
}
.actions {
display: flex;
gap: 1rem;
justify-content: center;
margin-top: 3rem;
}
.action-button {
display: flex;
align-items: center;
gap: 0.5rem;
padding: 0.75rem 1.5rem;
background: var(--bg-secondary);
border: 1px solid var(--border-color);
border-radius: 100px;
color: var(--text-primary) !important;
text-decoration: none !important;
font-size: 0.95rem;
transition: all 0.3s ease;
}
.action-button:hover {
transform: translateY(-2px);
border-color: var(--accent-color);
background: var(--accent-bg);
}
@media (max-width: 768px) {
.title-gradient, .subtitle-white {
font-size: 3rem;
}
.metrics-grid {
grid-template-columns: 1fr;
}
}
</style>
<div class="header-wrapper">
<div class="header-content">
<div class="title-section">
<div class="title-gradient">Agent Leaderboard</div>
<div class="description">
GenAI is evolving rapidly, with developers building exciting, high ROI agents.
We built this leaderboard to answer one simple question:
<div class="highlight-question">
"How do top LLMs perform in real-world agentic scenarios?"
</div>
</div>
</div>
<div class="actions">
<a href="https://galileo.ai/blog/agent-leaderboard" class="action-button">
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<path d="M15 7h3a5 5 0 0 1 5 5 5 5 0 0 1-5 5h-3m-6 0H6a5 5 0 0 1-5-5 5 5 0 0 1 5-5h3"/>
<line x1="8" y1="12" x2="16" y2="12"/>
</svg>
Blog
</a>
<a href="https://github.com/rungalileo/agent-leaderboard" class="action-button">
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<path d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"/>
</svg>
GitHub
</a>
<a href="https://huggingface.co./datasets/galileo-ai/agent-leaderboard" class="action-button">
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
<polyline points="7 10 12 15 17 10"/>
<line x1="12" y1="15" x2="12" y2="3"/>
</svg>
Dataset
</a>
</div>
</div>
</div>
"""
)
CARDS = """ <div class="metrics-grid"> | |
<div class="metric-card"> | |
<div class="metric-number metric-blue">17</div> | |
<div class="metric-label">Total Models</div> | |
<div class="metric-detail primary">12 Private</div> | |
<div class="metric-detail primary">5 Open Source</div> | |
</div> | |
<div class="metric-card"> | |
<div class="metric-number metric-purple">14</div> | |
<div class="metric-label">Evaluation Datasets</div> | |
<div class="metric-detail primary">Multi-Domain Testing</div> | |
<div class="metric-detail primary">Real-world use cases</div> | |
</div> | |
<div class="metric-card"> | |
<div class="metric-number metric-pink">TSQ</div> | |
<div class="metric-label">Evaluation Metric</div> | |
<div class="metric-detail primary">Tool Selection Quality</div> | |
<div class="metric-detail primary">GPT-4o Based Judge</div> | |
</div> | |
</div>""" | |
METHODOLOGY = """ | |
<style> | |
@media (prefers-color-scheme: dark) { | |
:root { | |
--bg-primary: #0B0B19; | |
--bg-secondary: rgba(19, 19, 37, 0.4); | |
--bg-tertiary: rgba(30, 30, 45, 0.95); | |
--text-primary: #ffffff; | |
--text-secondary: #94A3B8; | |
--text-tertiary: #E2E8F0; | |
--border-primary: rgba(31, 41, 55, 0.5); | |
--border-hover: rgba(79, 70, 229, 0.4); | |
--accent-blue: #60A5FA; | |
--accent-purple: #A78BFA; | |
--accent-pink: #F472B6; | |
--card-hover-bg: rgba(79, 70, 229, 0.1); | |
--shadow-color: rgba(79, 70, 229, 0.1); | |
} | |
} | |
@media (prefers-color-scheme: light) { | |
:root { | |
--bg-primary: #ffffff; | |
--bg-secondary: rgba(243, 244, 246, 0.4); | |
--bg-tertiary: rgba(249, 250, 251, 0.95); | |
--text-primary: #111827; | |
--text-secondary: #4B5563; | |
--text-tertiary: #6B7280; | |
--border-primary: rgba(209, 213, 219, 0.5); | |
--border-hover: rgba(79, 70, 229, 0.4); | |
--accent-blue: #3B82F6; | |
--accent-purple: #8B5CF6; | |
--accent-pink: #EC4899; | |
--card-hover-bg: rgba(243, 244, 246, 0.8); | |
--shadow-color: rgba(0, 0, 0, 0.1); | |
} | |
} | |
/* [Previous CSS remains the same until features-grid] */ | |
/* Features Grid Section */ | |
.features-grid { | |
display: grid; | |
grid-template-columns: repeat(3, 1fr); | |
gap: 1.5rem; | |
width: 100%; | |
padding: 2rem 0; | |
} | |
.dataset-table { | |
width: 100%; | |
border-collapse: separate; | |
border-spacing: 0; | |
margin: 2rem 0; | |
background: var(--bg-tertiary); | |
border-radius: 1rem; | |
overflow: hidden; | |
box-shadow: 0 4px 20px var(--shadow-color); | |
} | |
.dataset-table thead { | |
background: linear-gradient(90deg, var(--accent-blue), var(--accent-purple)); | |
} | |
.dataset-table th { | |
padding: 1.25rem 1rem; | |
text-align: left; | |
color: white; | |
font-weight: 600; | |
font-size: 1rem; | |
} | |
.dataset-table td { | |
padding: 1rem; | |
border-bottom: 1px solid var(--border-primary); | |
color: var(--text-secondary); | |
transition: all 0.2s ease; | |
} | |
.dataset-table tbody tr:hover td { | |
background: var(--card-hover-bg); | |
color: var(--text-primary); | |
} | |
.dataset-table td[rowspan] { | |
background: var(--bg-secondary); | |
color: var(--accent-blue); | |
font-weight: 600; | |
border-right: 1px solid var(--border-primary); | |
} | |
.purpose-cell { | |
max-width: 300px; | |
line-height: 1.5; | |
} | |
.category-cell { | |
color: var(--accent-purple); | |
font-weight: 500; | |
} | |
.dataset-name { | |
font-family: monospace; | |
color: var(--accent-pink); | |
font-size: 0.9rem; | |
} | |
.code-intro { | |
color: var(--text-secondary); | |
font-size: 1.1rem; | |
margin-bottom: 1.5rem; | |
line-height: 1.6; | |
} | |
.section-divider { | |
margin: 1rem 0; | |
border-top: 1px solid var(--border-color); | |
opacity: 0.3; | |
} | |
.key-insights thead tr { | |
background: linear-gradient(90deg, #60A5FA, #818CF8); | |
} | |
.key-insights td:first-child { | |
color: var(--accent-blue); | |
background: var(--bg-primary); | |
} | |
.key-insights td:last-child { | |
background: var(--bg-primary); | |
} | |
.key-insights td { | |
padding: 1rem; | |
border-bottom: 1px solid rgba(31, 41, 55, 0.5); | |
} | |
.highlight { | |
color: var(--accent-blue); | |
font-weight: 600; | |
display: inline-flex; | |
align-items: center; | |
} | |
.highlight::after { | |
content: ":"; | |
margin-right: 0.5rem; /* Adds space after the colon */ | |
} | |
@media (prefers-color-scheme: dark) { | |
:root { | |
--bg-primary: #0B0B19; | |
--bg-secondary: rgba(19, 19, 37, 0.4); | |
--text-primary: #ffffff; | |
--text-secondary: #94A3B8; | |
--border-color: rgba(31, 41, 55, 0.5); | |
--accent-blue: #60A5FA; | |
--accent-purple: #A78BFA; | |
--code-bg: #1E1E2E; | |
--code-line-highlight: rgba(96, 165, 250, 0.1); | |
--bullet-color: #60A5FA; | |
--table-header: #1a1b1e; | |
--table-border: #2d2e32; | |
--table-hover: #2d2e32; | |
} | |
} | |
@media (prefers-color-scheme: light) { | |
:root { | |
--bg-primary: #ffffff; | |
--bg-secondary: rgba(243, 244, 246, 0.4); | |
--text-primary: #111827; | |
--text-secondary: #4B5563; | |
--border-color: rgba(209, 213, 219, 0.5); | |
--accent-blue: #3B82F6; | |
--accent-purple: #8B5CF6; | |
--code-bg: #F8FAFC; | |
--code-line-highlight: rgba(59, 130, 246, 0.1); | |
--bullet-color: #3B82F6; | |
--table-header: #F8FAFC; | |
--table-border: #E5E7EB; | |
--table-hover: #F3F4F6; | |
} | |
} | |
.methodology-content { | |
max-width: 1200px; | |
margin: 0 auto; | |
padding: 2rem; | |
color: var(--text-secondary); | |
line-height: 1.7; | |
font-size: 1rem; | |
} | |
.section-title { | |
font-size: 2.5rem; | |
font-weight: 700; | |
margin: 3rem 0 1.5rem; | |
color: var(--text-primary); | |
background: linear-gradient(to right, var(--accent-blue), var(--accent-purple)); | |
-webkit-background-clip: text; | |
-webkit-text-fill-color: transparent; | |
letter-spacing: -0.02em; | |
} | |
.subsection-title { | |
font-size: 1.75rem; | |
font-weight: 600; | |
margin: 2rem 0 1rem; | |
color: var(--text-primary); | |
letter-spacing: -0.01em; | |
} | |
.content-block { | |
background: var(--bg-secondary); | |
border: 1px solid var(--border-color); | |
border-radius: 12px; | |
padding: 1.5rem; | |
margin: 1.5rem 0; | |
} | |
.methodology-list { | |
list-style: none !important; /* Force remove default bullets */ | |
padding: 0; | |
margin: 1rem 0; | |
} | |
.methodology-list li { | |
padding-left: 2rem; | |
position: relative; | |
margin: 1rem 0; | |
color: var(--text-secondary); | |
display: flex; /* Add flex display */ | |
align-items: flex-start; /* Align items to top */ | |
} | |
.methodology-list li::before { | |
content: ''; | |
position: absolute; | |
left: 0; | |
top: 0.75rem; | |
width: 8px; | |
height: 8px; | |
background: var(--bullet-color); | |
border-radius: 50%; | |
box-shadow: 0 0 0 2px rgba(96, 165, 250, 0.2); | |
flex-shrink: 0; /* Prevent bullet from shrinking */ | |
} | |
/* Additional fix for nested list items */ | |
.methodology-list li > * { | |
list-style: none !important; | |
margin-left: 0; | |
padding-left: 0; | |
} | |
.code-block { | |
background: var(--code-bg); | |
border-radius: 12px; | |
padding: 1.5rem; | |
margin: 1.5rem 0; | |
font-family: 'SF Mono', 'Menlo', monospace; | |
font-size: 0.9rem; | |
overflow-x: auto; | |
border: 1px solid var(--border-color); | |
} | |
.code-block pre { | |
margin: 0; | |
padding: 0; | |
} | |
.highlight { | |
color: var(--accent-blue); | |
font-weight: 600; | |
} | |
/* Dataset Table Styling */ | |
.dataset-table { | |
width: 100%; | |
border-collapse: collapse; | |
margin: 1.5rem 0; | |
background: var(--bg-secondary); | |
border-radius: 12px; | |
overflow: hidden; | |
} | |
.dataset-table th { | |
background: var(--table-header); | |
padding: 1rem; | |
text-align: left; | |
font-weight: 600; | |
color: var(--text-primary); | |
border-bottom: 2px solid var(--table-border); | |
} | |
.dataset-table td { | |
padding: 1rem; | |
border-bottom: 1px solid var(--table-border); | |
color: var(--text-secondary); | |
} | |
.dataset-table tbody tr:hover { | |
background: var(--table-hover); | |
} | |
.dataset-table td:first-child { | |
font-weight: 500; | |
} | |
</style> | |
<!-- Methodology Section --> | |
<h1 class="section-title">Methodology</h1> | |
<p>Our evaluation process follows a systematic approach to ensure comprehensive and fair assessment of AI agents. We evaluate language models' ability to effectively use tools | |
in single and multi-turn conversations. Our evaluation focuses on both basic functionality and edge | |
cases that challenge real-world applicability.</p> | |
<ul class="methodology-list"> | |
<li><span class="highlight">Model Selection</span>We begin by curating a diverse set of leading language models, including both proprietary and open-source implementations.</li> | |
<li><span class="highlight">Agent Configuration</span>Each model is configured as an agent using a standardized system prompt and given access to a consistent set of tools.</li> | |
<li><span class="highlight">Metric Definition</span> <a href="https://docs.galileo.ai/galileo/gen-ai-studio-products/galileo-guardrail-metrics/tool-selection-quality#tool-selection-quality">Tool Selection Quality (TSQ)</a></li> | |
<li><span class="highlight">Scoring System</span>The final performance score is calculated as an equally weighted average across all datasets.</li> | |
<li><span class="highlight">Dataset Curation</span>We strategically sampled from established benchmarking datasets. See later section for more info.</li> | |
<div class="methodology-section"> | |
<div class="table-container"> | |
<table class="dataset-table"> | |
<thead> | |
<tr> | |
<th>Dataset</th> | |
<th>Domains</th> | |
<th>Link</th> | |
</tr> | |
</thead> | |
<tbody> | |
<tr> | |
<td>BFCL</td> | |
<td>Mathematics, Entertainment, Education, and Academic Domains</td> | |
<td><a href="https://gorilla.cs.berkeley.edu/leaderboard.html">View Dataset</a></td> | |
</tr> | |
<tr> | |
<td>τ-bench</td> | |
<td>Retail and Airline Industry Scenarios</td> | |
<td><a href="https://github.com/sierra-research/tau-bench">View Dataset</a></td> | |
</tr> | |
<tr> | |
<td>xLAM</td> | |
<td>Cross-domain Data Generation (21 Domains)</td> | |
<td><a href="https://www.salesforce.com/blog/xlam-large-action-models/">View Dataset</a></td> | |
</tr> | |
<tr> | |
<td>ToolACE</td> | |
<td>API Interactions across 390 Domains</td> | |
<td><a href="https://arxiv.org/abs/2409.00920">View Dataset</a></td> | |
</tr> | |
</tbody> | |
</table> | |
</div> | |
</div> | |
</ul> | |
<div class="methodology-section"> | |
<h1 class="section-title">Key Insights</h1> | |
<div class="table-container"> | |
<table class="dataset-table key-insights"> | |
<thead> | |
<tr> | |
<th>Category</th> | |
<th>Finding</th> | |
</tr> | |
</thead> | |
<tbody> | |
<tr> | |
<td>Performance Champion</td> | |
<td>Gemini-2.0-flash dominates with 0.938 score at a very affordable cost, excelling in both complex tasks and safety features.</td> | |
</tr> | |
<tr> | |
<td>Price-Performance Paradox</td> | |
<td>Top 3 models span 10x price difference yet only 4% performance gap, challenging pricing assumptions</td> | |
</tr> | |
<tr> | |
<td>Open Vs Closed Source</td> | |
<td>The new Mistral-small leads in open source models and performs similar to GPT-4o-mini at 0.83, signaling OSS maturity in tool calling</td> | |
</tr> | |
<tr> | |
<td>Reasoning Models</td> | |
<td>Although being great for reasoning, o1 and o3-mini are far from perfect scoring 0.87 and 0.84 respectively. DeepSeek V3 and R1 were excluded from rankings due to limited function support</td> | |
</tr> | |
<tr> | |
<td>Tool Miss Detection</td> | |
<td>Low dataset averages of 0.60(tool_miss) and 0.73(miss_func) reveal fundamental challenges in handling edge cases and maintaining context, even as models excel at basic tasks</td> | |
</tr> | |
<tr> | |
<td>Architecture Trade-offs</td> | |
<td>Long context vs parallel execution shows architectural limits: O1 leads context (0.98) but fails parallel tasks (0.43), while GPT-4o shows opposite pattern</td> | |
</tr> | |
</tbody> | |
</table> | |
</div> | |
<h1 class="section-title">Development Implications</h2> | |
<div class="table-container"> | |
<table class="dataset-table key-insights"> | |
<thead> | |
<tr> | |
<th>Area</th> | |
<th>Recommendation</th> | |
</tr> | |
</thead> | |
<tbody> | |
<tr> | |
<td>Task Complexity</td> | |
<td>Simple tasks work with most models. Complex workflows requiring multiple tools need models with 0.85+ scores in composite tests</td> | |
</tr> | |
<tr> | |
<td>Error Handling</td> | |
<td>Models with low tool selection scores need guardrails. Add validation layers and structured error recovery, especially for parameter collection</td> | |
</tr> | |
<tr> | |
<td>Context Management</td> | |
<td>Long conversations require either models strong in context retention or external context storage systems</td> | |
</tr> | |
<tr> | |
<td>Reasoning Models</td> | |
<td>While o1 and o3-mini excelled in function calling, DeepSeek V3 and R1 were excluded from rankings due to limited function support</td> | |
</tr> | |
<tr> | |
<td>Safety Controls</td> | |
<td>Add strict tool access controls for models weak in irrelevance detection. Include validation layers for inconsistent performers</td> | |
</tr> | |
<tr> | |
<td>Open Vs Closed Source</td> | |
<td>Private models lead in complex tasks, but open-source options work well for basic operations. Choose based on your scaling needs</td> | |
</tr> | |
</tbody> | |
</table> | |
</div> | |
<div class="section-divider"></div> | |
<h1 class="section-title">What Makes Tool Selection Hard?</h1> | |
<div class="section-divider"></div> | |
<h2 class="subsection-title">Scenario Recognition</h2> | |
<div class="explanation-block"> | |
<p>When an agent encounters a query, it must first determine if tool usage is warranted. Information may already exist in the conversation history, making tool calls redundant. Alternatively, available tools might be insufficient or irrelevant to the task, requiring the agent to acknowledge limitations rather than force inappropriate tool usage.</p> | |
</div> | |
<div class="section-divider"></div> | |
<h2 class="subsection-title">Tool Selection Dynamics</h2> | |
<div class="explanation-block"> | |
<p>Tool selection isn't binary—it involves both precision and recall. An agent might correctly identify one necessary tool while missing others (recall issue) or select appropriate tools alongside unnecessary ones (precision issue). While suboptimal, these scenarios represent different severity levels of selection errors.</p> | |
</div> | |
<div class="section-divider"></div> | |
<h2 class="subsection-title">Parameter Handling</h2> | |
<div class="explanation-block"> | |
<p>Even with correct tool selection, argument handling introduces additional complexity. Agents must:</p> | |
<ul class="methodology-list"> | |
<li>Provide all required parameters with correct naming</li> | |
<li>Handle optional parameters appropriately</li> | |
<li>Maintain parameter value accuracy</li> | |
<li>Format arguments according to tool specifications</li> | |
</ul> | |
</div> | |
<div class="section-divider"></div> | |
<h2 class="subsection-title">Sequential Decision Making</h2> | |
<div class="explanation-block"> | |
<p>Multi-step tasks require agents to:</p> | |
<ul class="methodology-list"> | |
<li>Determine optimal tool calling sequence</li> | |
<li>Handle interdependencies between tool calls</li> | |
<li>Maintain context across multiple operations</li> | |
<li>Adapt to partial results or failures</li> | |
</ul> | |
</div> | |
<div class="section-divider"></div> | |
<h1 class="section-title">How Do We Measure Agent's Performance?</h1> | |
<p class="code-intro">We developed the Tool Selection Quality metric to assess agents' tool call performance, evaluating tool selection accuracy and effectiveness of parameter usage. This is an example code for evaluating an LLM with a dataset with Galileo's Tool Selection Quality.</p> | |
<div class="code-block"> | |
<pre> | |
import promptquality as pq | |
df = pd.read_parquet(file_path) | |
chainpoll_tool_selection_scorer = pq.CustomizedChainPollScorer( | |
scorer_name=pq.CustomizedScorerName.tool_selection_quality, | |
model_alias=pq.Models.gpt_4o, | |
) | |
evaluate_handler = pq.GalileoPromptCallback( | |
project_name=project_name, | |
run_name=run_name, | |
scorers=[chainpoll_tool_selection_scorer], | |
) | |
llm = llm_handler.get_llm(model, temperature=0.0, max_tokens=4000) # llm_handler is a custom handler for LLMs | |
system_msg = { | |
"role": "system", | |
"content": 'Your job is to use the given tools to answer the query of human. If there is no relevant tool then reply with "I cannot answer the question with given tools". If tool is available but sufficient information is not available, then ask human to get the same. You can call as many tools as you want. Use multiple tools if needed. If the tools need to be called in a sequence then just call the first tool.', | |
} | |
for row in df.itertuples(): | |
chain = llm.bind_tools(tools) # attach the tools | |
outputs.append( | |
chain.invoke( | |
[system_msg, *row.conversation], | |
config=dict(callbacks=[evaluate_handler]) | |
) | |
) | |
evaluate_handler.finish() | |
</pre> | |
</div> | |
</div> | |
<h1 class="section-title">Dataset Structure</h2> | |
<div class="table-container"> | |
<table class="dataset-table"> | |
<thead> | |
<tr> | |
<th>Type</th> | |
<th>Samples</th> | |
<th>Category</th> | |
<th>Dataset Name</th> | |
<th>Purpose</th> | |
</tr> | |
</thead> | |
<tbody> | |
<tr> | |
<td rowspan="5">Single-Turn</td> | |
<td>100 + 100</td> | |
<td class="category-cell">Single Function Call</td> | |
<td class="dataset-name">xlam_single_tool_single_call, xlam_multiple_tool_single_call</td> | |
<td class="purpose-cell">Evaluates basic ability to read documentation and make single function calls</td> | |
</tr> | |
<tr> | |
<td>200 + 50</td> | |
<td class="category-cell">Multiple Function Call</td> | |
<td class="dataset-name">xlam_multiple_tool_multiple_call, xlam_single_tool_multiple_call</td> | |
<td class="purpose-cell">Tests parallel execution and result aggregation capabilities</td> | |
</tr> | |
<tr> | |
<td>100</td> | |
<td class="category-cell">Irrelevant Query</td> | |
<td class="dataset-name">BFCL_v3_irrelevance</td> | |
<td class="purpose-cell">Tests ability to recognize when available tools don't match user needs</td> | |
</tr> | |
<tr> | |
<td>100</td> | |
<td class="category-cell">Long Context</td> | |
<td class="dataset-name">tau_long_context</td> | |
<td class="purpose-cell">Assesses handling of extended interactions and complex instructions</td> | |
</tr> | |
<tr> | |
<td>100</td> | |
<td class="category-cell">Missing Function</td> | |
<td class="dataset-name">xlam_tool_miss</td> | |
<td class="purpose-cell">Tests graceful handling of unavailable tools</td> | |
</tr> | |
<tr> | |
<td rowspan="5">Multi-Turn</td> | |
<td>50 + 30</td> | |
<td class="category-cell">Single Function Call</td> | |
<td class="dataset-name">BFCL_v3_multi_turn_base_single_func_call, toolscs_single_func_call</td> | |
<td class="purpose-cell">Tests basic conversational function calling abilities</td> | |
</tr> | |
<tr> | |
<td>50</td> | |
<td class="category-cell">Multiple Function Call</td> | |
<td class="dataset-name">BFCL_v3_multi_turn_base_multi_func_call</td> | |
<td class="purpose-cell">Evaluates handling of multiple function calls in conversation</td> | |
</tr> | |
<tr> | |
<td>100</td> | |
<td class="category-cell">Missing Function</td> | |
<td class="dataset-name">BFCL_v3_multi_turn_miss_func</td> | |
<td class="purpose-cell">Tests graceful handling of unavailable tools</td> | |
</tr> | |
<tr> | |
<td>100</td> | |
<td class="category-cell">Missing Parameters</td> | |
<td class="dataset-name">BFCL_v3_multi_turn_miss_param</td> | |
<td class="purpose-cell">Assesses parameter collection and handling incomplete information</td> | |
</tr> | |
<tr> | |
<td>100</td> | |
<td class="category-cell">Composite</td> | |
<td class="dataset-name">BFCL_v3_multi_turn_composite</td> | |
<td class="purpose-cell">Tests overall robustness in complex scenarios</td> | |
</tr> | |
</tbody> | |
</table> | |
</div> | |
<div class="section-divider"></div> | |
<h2 class="section-title">Citation</h2> | |
<div class="bibtex-citation" style="font-family: monospace; white-space: pre; padding: 1em; background-color: rgba(128, 128, 128, 0.1); border: 1px solid rgba(128, 128, 128, 0.2); border-radius: 4px; color: currentColor;">@misc{agent-leaderboard, | |
author = {Pratik Bhavsar}, | |
title = {Agent Leaderboard}, | |
year = {2025}, | |
publisher = {Galileo.ai}, | |
howpublished = {\\url{https://huggingface.co./spaces/galileo-ai/agent-leaderboard}} | |
}</div> | |
<!-- Features Grid Section --> | |
<div class="features-grid"> | |
<div class="feature-card"> | |
<div class="feature-icon"> | |
<svg width="24" height="24" fill="none" stroke="var(--accent-blue)" stroke-width="2" viewBox="0 0 24 24"> | |
<path d="M22 12h-4l-3 9L9 3l-3 9H2"/> | |
</svg> | |
</div> | |
<h3 class="feature-title">Make Better Decisions</h3> | |
<ul class="feature-list"> | |
<li>Cost-effectiveness analysis</li> | |
<li>Business impact metrics</li> | |
<li>Vendor strategy insights</li> | |
</ul> | |
</div> | |
<div class="feature-card"> | |
<div class="feature-icon"> | |
<svg width="24" height="24" fill="none" stroke="var(--accent-purple)" stroke-width="2" viewBox="0 0 24 24"> | |
<path d="M21 16V8a2 2 0 0 0-1-1.73l-7-4a2 2 0 0 0-2 0l-7 4A2 2 0 0 0 3 8v8a2 2 0 0 0 1 1.73l7 4a2 2 0 0 0 2 0l7-4A2 2 0 0 0 21 16z"/> | |
</svg> | |
</div> | |
<h3 class="feature-title">360° Domain Evaluation</h3> | |
<ul class="feature-list"> | |
<li>Multi-domain evaluation</li> | |
<li>Real-world use cases</li> | |
<li>Edge case evaluation</li> | |
</ul> | |
</div> | |
<div class="feature-card"> | |
<div class="feature-icon"> | |
<svg width="24" height="24" fill="none" stroke="var(--accent-pink)" stroke-width="2" viewBox="0 0 24 24"> | |
<path d="M21 2v6h-6M3 12a9 9 0 0 1 15-6.7L21 8M3 12a9 9 0 0 0 15 6.7L21 16M21 22v-6h-6"/> | |
</svg> | |
</div> | |
<h3 class="feature-title">Updated Periodically</h3> | |
<ul class="feature-list"> | |
<li>12 private models evaluated</li> | |
<li>5 open source models included</li> | |
<li>Monthly model additions</li> | |
</ul> | |
</div> | |
</div> | |
""" | |