import pandas as pd

DATASETS = [
    "BFCL_v3_irrelevance",
    "BFCL_v3_multi_turn_base_multi_func_call",
    "BFCL_v3_multi_turn_base_single_func_call",
    "BFCL_v3_multi_turn_composite",
    "BFCL_v3_multi_turn_long_context",
    "BFCL_v3_multi_turn_miss_func",
    "BFCL_v3_multi_turn_miss_param",
    "tau_long_context",
    "toolace_single_func_call_1",
    "toolace_single_func_call_2",
    "xlam_multiple_tool_multiple_call",
    "xlam_multiple_tool_single_call",
    "xlam_single_tool_multiple_call",
    "xlam_single_tool_single_call",
    "xlam_tool_miss",
]

SCORES = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]


def load_data():
    """Load and preprocess the data."""
    df = pd.read_csv("results.csv").dropna()
    # Add combined I/O cost column with 3:1 ratio
    df["IO Cost"] = (
        df["Input cost per million token"] * 0.75
        + df["Output cost per million token"] * 0.25
    )
    return df


# categories.py
CATEGORIES = {
    "Overall": ["Model Avg"],
    "Overall single turn": ["single turn perf"],
    "Overall multi turn": ["multi turn perf"],
    "Single func call": [
        "xlam_single_tool_single_call",
        "xlam_multiple_tool_single_call",
    ],
    "Multiple func call": [
        "xlam_multiple_tool_multiple_call",
        "xlam_single_tool_multiple_call",
        "BFCL_v3_multi_turn_base_multi_func_call",
    ],
    "Irrelevant query": ["BFCL_v3_irrelevance"],
    "Long context": ["tau_long_context", "BFCL_v3_multi_turn_long_context"],
    "Missing func": ["xlam_tool_miss", "BFCL_v3_multi_turn_miss_func"],
    "Missing params": ["BFCL_v3_multi_turn_miss_param"],
    "Composite": ["BFCL_v3_multi_turn_composite"],
}

chat_css = """
/* Container styles */
.container { display: flex; gap: 1.5rem; height: calc(100vh - 100px); padding: 1rem; }

/* Chat panel styles */
.chat-panel { flex: 2; background: #1a1f2c; border-radius: 1rem; padding: 1rem; overflow-y: auto; max-height: calc(100vh - 120px); }

/* Message styles */
.message { padding: 1.2rem; margin: 0.8rem; border-radius: 1rem; font-family: monospace; box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); }
.system { background: linear-gradient(135deg, #8e44ad, #9b59b6); }
.user { background: linear-gradient(135deg, #2c3e50, #3498db); margin-left: 2rem; }
.assistant { background: linear-gradient(135deg, #27ae60, #2ecc71); margin-right: 2rem; }
.role-badge { display: inline-block; padding: 0.3rem 0.8rem; border-radius: 0.5rem; font-weight: bold; margin-bottom: 0.8rem; font-size: 0.9rem; text-transform: uppercase; letter-spacing: 0.05em; }
.system-role { background-color: #8e44ad; color: white; }
.user-role { background-color: #3498db; color: white; }
.assistant-role { background-color: #27ae60; color: white; }
.content { white-space: pre-wrap; word-break: break-word; color: #f5f6fa; line-height: 1.5; }

/* Metrics panel styles */
.metrics-panel { flex: 1; display: flex; flex-direction: column; gap: 2rem; padding: 1.5rem; background: #1a1f2c; border-radius: 1rem; }
.metric-section { background: #1E293B; padding: 1.5rem; border-radius: 1rem; }
.score-section { text-align: center; }
.score-display { font-size: 3rem; font-weight: bold; color: #4ADE80; line-height: 1; margin: 0.5rem 0; }
.explanation-text { color: #E2E8F0; line-height: 1.6; font-size: 0.95rem; }

/* Tool info panel styles */
.tool-info-panel { background: #1a1f2c; padding: 1.5rem; border-radius: 1rem; color: #f5f6fa; }
.tool-section { margin-bottom: 1.5rem; }
.tool-name { font-size: 1.2rem; color: #4ADE80; font-weight: bold; margin-bottom: 0.5rem; }
.tool-description { color: #E2E8F0; line-height: 1.6; margin-bottom: 1rem; }
.tool-parameters .parameter { margin: 0.5rem 0; padding: 0.5rem; background: rgba(255, 255, 255, 0.05); border-radius: 0.5rem; }
.param-name { color: #63B3ED; font-weight: bold; margin-right: 0.5rem; }
.tool-examples .example { margin: 0.5rem 0; padding: 0.5rem; background: rgba(255, 255, 255, 0.05); border-radius: 0.5rem; font-family: monospace; }

/* Custom scrollbar */
::-webkit-scrollbar { width: 8px; }
::-webkit-scrollbar-track { background: rgba(255, 255, 255, 0.1); border-radius: 4px; }
::-webkit-scrollbar-thumb { background: linear-gradient(45deg, #3498db, #2ecc71); border-radius: 4px; }

/* Title styles */
.title { color: #63B3ED; font-size: 2rem; font-weight: bold; text-align: center; margin-bottom: 1.5rem; padding: 1rem; }

/* Headers */
h3 { color: #63B3ED; margin: 0 0 1rem 0; font-size: 1.1rem; font-weight: 500; letter-spacing: 0.05em; }
"""

COMMON = """
"""

DESCRIPTION_HTML = """
🎯 Purpose Latest Update: Feb 2025

This comprehensive benchmark evaluates language models' ability to effectively utilize tools and functions in complex scenarios.

πŸ” What We Evaluate
πŸ”„ Single/Multi-turn Interactions
🧩 Function Composition
⚑ Error Handling
πŸ“Š Key Results
βœ… Tool Selection Quality
πŸ’° Open Vs Closed Source
βš–οΈ Overall Effectiveness
""" HEADER_CONTENT = ( COMMON + """
Agent Leaderboard
GenAI is evolving rapidly, with developers building exciting, high-ROI agents. We built this leaderboard to answer one simple question:
"How do top LLMs perform in real-world agentic scenarios?"
""" ) CARDS = """
17
Total Models
12 Private
5 Open Source
14
Evaluation Datasets
Multi-Domain Testing
Real-world use cases
TSQ
Evaluation Metric
Tool Selection Quality
GPT-4o Based Judge
""" METHODOLOGY = """

Methodology

Our evaluation process follows a systematic approach to ensure a comprehensive and fair assessment of AI agents. We evaluate language models' ability to use tools effectively in single- and multi-turn conversations, focusing on both basic functionality and edge cases that challenge real-world applicability.

Key Insights

Category | Finding
Performance Champion | Gemini-2.0-flash dominates with a 0.938 score at a very affordable cost, excelling in both complex tasks and safety features.
Price-Performance Paradox | The top 3 models span a 10x price difference yet only a 4% performance gap, challenging pricing assumptions.
Open vs Closed Source | The new Mistral-small leads among open-source models and performs on par with GPT-4o-mini at 0.83, signaling OSS maturity in tool calling.
Reasoning Models | Although strong at reasoning, o1 and o3-mini are far from perfect, scoring 0.87 and 0.84 respectively. DeepSeek V3 and R1 were excluded from the rankings due to limited function support.
Tool Miss Detection | Low dataset averages of 0.60 (tool_miss) and 0.73 (miss_func) reveal fundamental challenges in handling edge cases and maintaining context, even as models excel at basic tasks.
Architecture Trade-offs | Long context vs parallel execution exposes architectural limits: o1 leads on long context (0.98) but struggles with parallel tasks (0.43), while GPT-4o shows the opposite pattern.

Development Implications

Area | Recommendation
Task Complexity | Simple tasks work with most models. Complex workflows requiring multiple tools need models with 0.85+ scores on composite tests.
Error Handling | Models with low tool selection scores need guardrails. Add validation layers and structured error recovery, especially for parameter collection (a minimal sketch follows this table).
Context Management | Long conversations require either models with strong context retention or external context storage systems.
Reasoning Models | While o1 and o3-mini excelled in function calling, DeepSeek V3 and R1 were excluded from the rankings due to limited function support.
Safety Controls | Add strict tool-access controls for models weak in irrelevance detection. Include validation layers for inconsistent performers.
Open vs Closed Source | Private models lead in complex tasks, but open-source options work well for basic operations. Choose based on your scaling needs.
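
A model that selects the wrong tool or mis-fills its arguments is easier to contain when every proposed call is checked against the tool's declared schema before execution. The sketch below is a minimal illustration of such a validation layer, not part of the leaderboard harness; validate_tool_call is a hypothetical helper and the tool spec follows a generic JSON-schema style function definition.

def validate_tool_call(tool_call: dict, tool_specs: dict) -> list:
    # Return a list of problems with a proposed call; an empty list means it can be executed.
    problems = []
    spec = tool_specs.get(tool_call["name"])
    if spec is None:
        problems.append("unknown tool: " + tool_call["name"])
        return problems
    params = spec.get("parameters", {})
    allowed = set(params.get("properties", {}))
    required = set(params.get("required", []))
    supplied = set(tool_call.get("args", {}))
    for name in sorted(required - supplied):
        problems.append("missing required parameter: " + name)
    for name in sorted(supplied - allowed):
        problems.append("unexpected parameter: " + name)
    return problems

tool_specs = {
    "get_weather": {
        "parameters": {
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        }
    }
}
issues = validate_tool_call({"name": "get_weather", "args": {}}, tool_specs)
if issues:
    print("Guardrail triggered:", issues)  # missing required parameter: city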

What Makes Tool Selection Hard?

Scenario Recognition

When an agent encounters a query, it must first determine if tool usage is warranted. Information may already exist in the conversation history, making tool calls redundant. Alternatively, available tools might be insufficient or irrelevant to the task, requiring the agent to acknowledge limitations rather than force inappropriate tool usage.

Tool Selection Dynamics

Tool selection isn't binary: it involves both precision and recall. An agent might correctly identify one necessary tool while missing others (recall issue) or select appropriate tools alongside unnecessary ones (precision issue). While suboptimal, these scenarios represent different severity levels of selection errors.
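
One way to make the distinction concrete is to score the set of tools an agent called against the set it should have called. The helper below is an illustrative sketch of set precision and recall over tool names, not the TSQ scorer the leaderboard actually uses.

def tool_selection_pr(expected, predicted):
    # Precision: how many of the called tools were actually needed.
    # Recall: how many of the needed tools were actually called.
    expected, predicted = set(expected), set(predicted)
    if not predicted:
        precision = 1.0 if not expected else 0.0
    else:
        precision = len(expected & predicted) / len(predicted)
    recall = 1.0 if not expected else len(expected & predicted) / len(expected)
    return precision, recall

# Missing one needed tool: perfect precision, imperfect recall.
print(tool_selection_pr({"get_weather", "get_time"}, {"get_weather"}))         # (1.0, 0.5)
# One unnecessary extra tool: imperfect precision, perfect recall.
print(tool_selection_pr({"get_weather"}, {"get_weather", "get_stock_price"}))  # (0.5, 1.0)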

Parameter Handling

Even with correct tool selection, argument handling introduces additional complexity. Agents must:

Sequential Decision Making

Multi-step tasks require agents to:

How Do We Measure Agent Performance?

We developed the Tool Selection Quality (TSQ) metric to assess agents' tool-calling performance, evaluating both tool selection accuracy and the effectiveness of parameter usage. Below is example code for evaluating an LLM on a dataset with Galileo's Tool Selection Quality scorer.

import pandas as pd
import promptquality as pq

# file_path, project_name, run_name, model and tools are defined elsewhere for each run
df = pd.read_parquet(file_path)

# TSQ is computed by a customized ChainPoll scorer with GPT-4o as the judge
chainpoll_tool_selection_scorer = pq.CustomizedChainPollScorer(
    scorer_name=pq.CustomizedScorerName.tool_selection_quality,
    model_alias=pq.Models.gpt_4o,
)

# Callback that logs each generation to Galileo and attaches the TSQ scorer
evaluate_handler = pq.GalileoPromptCallback(
    project_name=project_name,
    run_name=run_name,
    scorers=[chainpoll_tool_selection_scorer],
)

llm = llm_handler.get_llm(model, temperature=0.0, max_tokens=4000)  # llm_handler is a custom handler for LLMs

system_msg = {
    "role": "system",
    "content": 'Your job is to use the given tools to answer the query of human. If there is no relevant tool then reply with "I cannot answer the question with given tools". If tool is available but sufficient information is not available, then ask human to get the same. You can call as many tools as you want. Use multiple tools if needed. If the tools need to be called in a sequence then just call the first tool.',
}

outputs = []
for row in df.itertuples():
    chain = llm.bind_tools(tools)  # attach the tools available for this sample
    outputs.append(
        chain.invoke(
            [system_msg, *row.conversation],
            config=dict(callbacks=[evaluate_handler]),
        )
    )

evaluate_handler.finish()
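
The responses collected in outputs also make the scenario recognition behaviour discussed above easy to inspect. With LangChain chat models, the returned message exposes proposed calls via tool_calls, and an empty list means the model answered, or declined, in plain text. A rough follow-up sketch reusing the variables from the snippet above:

for response in outputs:
    if response.tool_calls:
        # The model decided one or more tools were needed.
        for call in response.tool_calls:
            print("tool:", call["name"], "args:", call["args"])
    else:
        # No tool call: the model answered from context or declined,
        # e.g. with "I cannot answer the question with given tools".
        print("text reply:", response.content)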

Dataset Structure

Type | Samples | Category | Dataset Name | Purpose
Single-Turn | 100 + 100 | Single Function Call | xlam_single_tool_single_call, xlam_multiple_tool_single_call | Evaluates basic ability to read documentation and make single function calls
Single-Turn | 200 + 50 | Multiple Function Call | xlam_multiple_tool_multiple_call, xlam_single_tool_multiple_call | Tests parallel execution and result aggregation capabilities
Single-Turn | 100 | Irrelevant Query | BFCL_v3_irrelevance | Tests ability to recognize when available tools don't match user needs
Single-Turn | 100 | Long Context | tau_long_context | Assesses handling of extended interactions and complex instructions
Single-Turn | 100 | Missing Function | xlam_tool_miss | Tests graceful handling of unavailable tools
Multi-Turn | 50 + 30 | Single Function Call | BFCL_v3_multi_turn_base_single_func_call, toolace_single_func_call | Tests basic conversational function calling abilities
Multi-Turn | 50 | Multiple Function Call | BFCL_v3_multi_turn_base_multi_func_call | Evaluates handling of multiple function calls in conversation
Multi-Turn | 100 | Missing Function | BFCL_v3_multi_turn_miss_func | Tests graceful handling of unavailable tools
Multi-Turn | 100 | Missing Parameters | BFCL_v3_multi_turn_miss_param | Assesses parameter collection and handling of incomplete information
Multi-Turn | 100 | Composite | BFCL_v3_multi_turn_composite | Tests overall robustness in complex scenarios
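
The exact schema differs across the source datasets, but as a rough mental model each sample bundles a conversation, the tools exposed to the model, and the behaviour the judge expects. The field names below are illustrative only, not the actual dataset columns.

# Illustrative shape of one evaluation record (field names are hypothetical).
sample = {
    "conversation": [
        {"role": "user", "content": "What is the weather in Paris tomorrow?"},
    ],
    "tools": [
        {
            "name": "get_weather",
            "description": "Get the weather forecast for a city.",
            "parameters": {
                "properties": {"city": {"type": "string"}, "date": {"type": "string"}},
                "required": ["city"],
            },
        }
    ],
    # For a missing-function dataset such as xlam_tool_miss, the expected behaviour
    # would instead be to decline rather than call any tool.
    "expected": {"tool": "get_weather", "args": {"city": "Paris", "date": "tomorrow"}},
}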

Citation

@misc{agent-leaderboard,
  author = {Pratik Bhavsar},
  title = {Agent Leaderboard},
  year = {2025},
  publisher = {Galileo.ai},
  howpublished = {\\url{https://huggingface.co./spaces/galileo-ai/agent-leaderboard}}
}

Make Better Decisions

360° Domain Evaluation

Updated Periodically

"""