Spaces:

galileo-ai
/

agent-leaderboard

Running on CPU Upgrade

App Files Files Community

Pratik Bhavsar committed 1 day ago

Commit

b0ce6f5

1 Parent(s): df66c39

added data exploration

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

app.py +7 -8
chat.py +266 -115
data_loader.py +19 -5
get_exp_data.ipynb +167 -0
output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_irrelevance.parquet +0 -0
output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_base_multi_func_call.parquet +0 -0
output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_base_single_func_call.parquet +0 -0
output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_composite.parquet +0 -0
output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_long_context.parquet +0 -0
output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_miss_func.parquet +0 -0
output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_miss_param.parquet +0 -0
output/Llama-3.3-70B-Instruct-Turbo/tau_long_context.parquet +0 -0
output/Llama-3.3-70B-Instruct-Turbo/toolace_single_func_call_1.parquet +0 -0
output/Llama-3.3-70B-Instruct-Turbo/toolace_single_func_call_2.parquet +0 -0
output/Llama-3.3-70B-Instruct-Turbo/xlam_multiple_tool_multiple_call.parquet +0 -0
output/Llama-3.3-70B-Instruct-Turbo/xlam_multiple_tool_single_call.parquet +0 -0
output/Llama-3.3-70B-Instruct-Turbo/xlam_single_tool_multiple_call.parquet +0 -0
output/Llama-3.3-70B-Instruct-Turbo/xlam_single_tool_single_call.parquet +0 -0
output/Llama-3.3-70B-Instruct-Turbo/xlam_tool_miss.parquet +0 -0
output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_irrelevance.parquet +0 -0
output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_base_multi_func_call.parquet +0 -0
output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_base_single_func_call.parquet +0 -0
output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_composite.parquet +0 -0
output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_long_context.parquet +0 -0
output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_miss_func.parquet +0 -0
output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_miss_param.parquet +0 -0
output/Meta-Llama-3.1-8B-Instruct-Turbo/tau_long_context.parquet +0 -0
output/Meta-Llama-3.1-8B-Instruct-Turbo/toolace_single_func_call_1.parquet +0 -0
output/Meta-Llama-3.1-8B-Instruct-Turbo/toolace_single_func_call_2.parquet +0 -0
output/Meta-Llama-3.1-8B-Instruct-Turbo/xlam_multiple_tool_multiple_call.parquet +0 -0
output/Meta-Llama-3.1-8B-Instruct-Turbo/xlam_multiple_tool_single_call.parquet +0 -0
output/Meta-Llama-3.1-8B-Instruct-Turbo/xlam_single_tool_multiple_call.parquet +0 -0
output/Meta-Llama-3.1-8B-Instruct-Turbo/xlam_single_tool_single_call.parquet +0 -0
output/Meta-Llama-3.1-8B-Instruct-Turbo/xlam_tool_miss.parquet +0 -0
output/claude-3-5-haiku-20241022/BFCL_v3_irrelevance.parquet +0 -0
output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_base_multi_func_call.parquet +0 -0
output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_base_single_func_call.parquet +0 -0
output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_composite.parquet +0 -0
output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_long_context.parquet +0 -0
output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_miss_func.parquet +0 -0
output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_miss_param.parquet +0 -0
output/claude-3-5-haiku-20241022/tau_long_context.parquet +0 -0
output/claude-3-5-haiku-20241022/toolace_single_func_call_1.parquet +0 -0
output/claude-3-5-haiku-20241022/toolace_single_func_call_2.parquet +0 -0
output/claude-3-5-haiku-20241022/xlam_multiple_tool_multiple_call.parquet +0 -0
output/claude-3-5-haiku-20241022/xlam_multiple_tool_single_call.parquet +0 -0
output/claude-3-5-haiku-20241022/xlam_single_tool_multiple_call.parquet +0 -0
output/claude-3-5-haiku-20241022/xlam_single_tool_single_call.parquet +0 -0
output/claude-3-5-haiku-20241022/xlam_tool_miss.parquet +0 -0
output/claude-3-5-sonnet-20241022/BFCL_v3_irrelevance.parquet +0 -0

app.py CHANGED Viewed

@@ -3,7 +3,6 @@ import promptquality as pq
 from dotenv import load_dotenv
 load_dotenv()
-pq.login("https://console.demo.rungalileo.io")
 from data_loader import (
     load_data,
@@ -36,9 +35,9 @@ def create_app():
             mc_info, mc_plot = create_model_comparison_tab(df, HEADER_CONTENT)
-            # exp_outputs = create_exploration_tab(
-            #     df, MODELS, DATASETS, SCORES, HEADER_CONTENT
-            # )
         # Initial loads
         app.load(
@@ -55,10 +54,10 @@ def create_app():
             outputs=[mc_info, mc_plot],
         )
-        # app.load(
-        #     fn=lambda: filter_and_update_display(MODELS[0], DATASETS[0], [], 0),
-        #     outputs=exp_outputs,
-        # )
     return app

 from dotenv import load_dotenv
 load_dotenv()
 from data_loader import (
     load_data,
             mc_info, mc_plot = create_model_comparison_tab(df, HEADER_CONTENT)
+            exp_outputs = create_exploration_tab(
+                df, MODELS, DATASETS, SCORES, HEADER_CONTENT
+            )
         # Initial loads
         app.load(
             outputs=[mc_info, mc_plot],
         )
+        app.load(
+            fn=lambda: filter_and_update_display(MODELS[0], DATASETS[0], 0, 1, 0),
+            outputs=exp_outputs,
+        )
     return app

chat.py CHANGED Viewed

@@ -1,199 +1,350 @@
-# chat.py
 import gradio as gr
-import json
 import pandas as pd
-import numpy as np
-from functools import lru_cache
-import promptquality as pq
-project_name = "agent-lb-v1"
-PROJECT_ID = pq.get_project_from_name(project_name).id
-@lru_cache(maxsize=1000)
-def get_model_score_for_dataset(model, dataset):
-    print(f"Getting metrics for {model} {project_name} for dataset {dataset}")
-    run_name = f"{model} {dataset}"
-    run_id = pq.get_run_from_name(run_name, PROJECT_ID).id
-    rows = pq.get_rows(
-        project_id=PROJECT_ID,
-        run_id=run_id,
-        task_type=None,
-        config=None,
-        starting_token=0,
-        limit=1000,
-    )
-    rationales = [d.metrics.tool_selection_quality_rationale for d in rows]
-    scores = [
-        round(d.metrics.tool_selection_quality, 2)
-        for d, rationale in zip(rows, rationales)
-        if rationale
-    ]
-    explanations = [
-        d.metrics.tool_selection_quality_explanation
-        for d, rationale in zip(rows, rationales)
-        if rationale
     ]
-    rationales = [r for r in rationales if r]
-    mean_score = round(np.mean(scores), 2)
-    return {
-        "mean_score": mean_score,
-        "scores": scores,
-        "rationales": rationales,
-        "explanations": explanations,
-    }
-def get_updated_df(df, data):
-    df["rationale"] = data["rationales"]
-    df["explanation"] = data["explanations"]
-    df["score"] = data["scores"]
-    return df
 def get_chat_and_score_df(model, dataset):
-    data = get_model_score_for_dataset(model, dataset)
     df = pd.read_parquet(f"datasets/{dataset}.parquet")
-    df = get_updated_df(df, data)
     return df
-def format_chat_message(role, content):
-    """Format individual chat messages with proper styling."""
     role_style = role.lower()
     return f"""
-    <div class="message {role_style}">
-        <div class="role-badge {role_style}-role">{role}</div>
-        <div class="content">{content}</div>
     </div>
     """
 def format_tool_info(tools):
-    """Format tool information with proper styling."""
     if isinstance(tools, str):
         try:
             tools = json.loads(tools)
         except:
-            return "<div>No tool information available</div>"
     if not tools:
-        return "<div>No tool information available</div>"
     tool_html = ""
     for tool in tools:
         tool_html += f"""
-        <div class="tool-section">
-            <div class="tool-name">{tool.get('name', 'Unnamed Tool')}</div>
-            <div class="tool-description">{tool.get('description', 'No description available')}</div>
-            <div class="tool-parameters">
-                {format_parameters(tool.get('parameters', {}))}
             </div>
         </div>
         """
-    return f'<div class="tool-info-panel">{tool_html}</div>'
 def format_parameters(parameters):
     if not parameters:
-        return "<div>No parameters</div>"
     params_html = ""
     for name, desc in parameters.items():
         params_html += f"""
-        <div class="parameter">
-            <span class="param-name">{name}:</span> {desc}
         </div>
         """
     return params_html
 def format_metrics(score, rationale, explanation):
-    """Format metrics display with proper styling."""
     return f"""
-    <div class="metrics-panel">
-        <div class="metric-section">
-            <h3>Score</h3>
-            <div class="score-display">{score:.2f}</div>
         </div>
-        <div class="metric-section">
-            <h3>Rationale</h3>
-            <div class="explanation-text">{rationale}</div>
         </div>
-        <div class="metric-section">
-            <h3>Explanation</h3>
-            <div class="explanation-text">{explanation}</div>
         </div>
     </div>
     """
 def update_chat_display(df, index):
-    """Update the chat visualization for a specific index."""
     if df is None or df.empty or index >= len(df):
         return (
-            "<div>No data available</div>",
-            "<div>No metrics available</div>",
-            "<div>No tool information available</div>",
         )
     row = df.iloc[index]
-    # Format chat messages
     messages = json.loads(row["conversation"])
     chat_html = f"""
-    <div class="chat-panel">
-        {"".join([format_chat_message(msg["role"], msg["content"])
-                 for msg in messages])}
     </div>
     """
-    # Format metrics
     metrics_html = format_metrics(row["score"], row["rationale"], row["explanation"])
-    # Format tool info
     tool_html = format_tool_info(row["tools_langchain"])
     return chat_html, metrics_html, tool_html
-def filter_and_update_display(model, dataset, selected_scores, current_index):
     try:
-        # Get data and filter by scores
         df_chat = get_chat_and_score_df(model, dataset)
-        if selected_scores:
-            df_chat = df_chat[df_chat["score"].isin(selected_scores)]
         if df_chat.empty:
             return (
-                "<div>No data available for selected filters</div>",
-                "<div>No metrics available</div>",
-                "<div>No tool information available</div>",
-                gr.update(maximum=0, value=0),
                 "0/0",
             )
-        # Update index bounds
         max_index = len(df_chat) - 1
         current_index = min(current_index, max_index)
-        # Get displays for current index
         chat_html, metrics_html, tool_html = update_chat_display(df_chat, current_index)
-        return (
-            chat_html,
-            metrics_html,
-            tool_html,
-            gr.update(maximum=max_index, value=current_index),
-            f"{current_index + 1}/{len(df_chat)}",
-        )
     except Exception as e:
-        print(f"Error in filter_and_update_display: {str(e)}")
         return (
-            f"<div>Error: {str(e)}</div>",
-            "<div>No metrics available</div>",
-            "<div>No tool information available</div>",
-            gr.update(maximum=0, value=0),
             "0/0",
         )

 import gradio as gr
 import pandas as pd
+import json
+def get_updated_df(df, df_output):
+    df = df.iloc[: len(df_output)].copy()
+    df["response"] = df_output["response"].tolist()
+    df["rationale"] = df_output["rationale"].tolist()
+    df["explanation"] = df_output["explanation"].tolist()
+    df["score"] = df_output["score"].tolist()
+    cols = [
+        "conversation",
+        "tools_langchain",
+        "n_turns",
+        "len_query",
+        "n_tools",
+        "response",
+        "rationale",
+        "explanation",
+        "score",
     ]
+    return df[cols]
 def get_chat_and_score_df(model, dataset):
+    df_output = pd.read_parquet(f"output/{model}/{dataset}.parquet")
     df = pd.read_parquet(f"datasets/{dataset}.parquet")
+    df = get_updated_df(df, df_output)
     return df
+def format_chat_message(role, content, is_response=False):
+    """Format individual chat messages with alignment based on role."""
     role_style = role.lower()
+    alignment = "flex-end" if role_style == "user" else "flex-start"
+    max_width = "80%"
+    # Clean up any excessive whitespace while preserving intentional line breaks
+    cleaned_content = "\n".join(line.strip() for line in content.split("\n"))
+    background_color = (
+        "var(--response-bg)" if is_response else f"var(--message-bg-{role_style})"
+    )
     return f"""
+    <div style="
+        display: flex;
+        justify-content: {alignment};
+        margin: 0.75rem 0;">
+        <div style="
+            max-width: {max_width};
+            padding: 1rem;
+            border-radius: 12px;
+            background-color: {background_color};
+            border: 1px solid var(--border-color);
+            box-shadow: 0 1px 2px var(--shadow-color);">
+            <div style="
+                font-weight: 600;
+                color: var(--primary-text);
+                margin-bottom: 0.5rem;
+                font-size: 0.9rem;
+                text-transform: uppercase;">
+                {role + (" Response" if is_response else "")}
+            </div>
+            <div style="
+                color: var(--text-color);
+                line-height: 1.6;
+                white-space: pre-wrap;
+                font-family: {is_response and 'monospace' or 'inherit'};
+                font-size: {is_response and '0.9rem' or 'inherit'};">
+                {cleaned_content}
+            </div>
+        </div>
     </div>
     """
+def format_response(response):
+    """Format the response data, handling both JSON and text."""
+    try:
+        # Try to parse as JSON
+        response_data = json.loads(response)
+        # Format JSON response nicely
+        formatted_response = json.dumps(response_data, indent=2)
+    except (json.JSONDecodeError, TypeError):
+        # If not JSON, use as is
+        formatted_response = str(response)
+    return formatted_response
+def parse_tool_schema(tool):
+    """Parse tool schema to extract name, description, and parameters properly."""
+    name = tool.get("title", "Unnamed Tool")
+    description = tool.get("description", "No description available")
+    parameters = {}
+    if "properties" in tool:
+        for param_name, param_data in tool["properties"].items():
+            param_desc = param_data.get("description", "No description")
+            param_type = param_data.get("type", "unknown")
+            parameters[param_name] = f"{param_desc} (Type: {param_type})"
+    return name, description, parameters
 def format_tool_info(tools):
+    """Format tool information with improved schema parsing and dark theme support."""
     if isinstance(tools, str):
         try:
             tools = json.loads(tools)
         except:
+            return '<div style="padding: 1rem; color: var(--text-muted);">No tool information available</div>'
     if not tools:
+        return '<div style="padding: 1rem; color: var(--text-muted);">No tool information available</div>'
     tool_html = ""
     for tool in tools:
+        name, description, parameters = parse_tool_schema(tool)
         tool_html += f"""
+        <div style="
+            margin: 1rem 0;
+            padding: 1.5rem;
+            border-radius: 8px;
+            background-color: var(--surface-color);
+            border: 1px solid var(--border-color);">
+            <div style="
+                font-weight: 600;
+                color: var(--primary-text);
+                margin-bottom: 0.75rem;
+                font-size: 1.1rem;">
+                {name}
+            </div>
+            <div style="
+                color: var(--text-color);
+                margin-bottom: 1rem;
+                line-height: 1.5;">
+                {description}
+            </div>
+            <div style="
+                background-color: var(--surface-color-alt);
+                padding: 1rem;
+                border-radius: 4px;
+                border: 1px solid var(--border-color);">
+                {format_parameters(parameters)}
             </div>
         </div>
         """
+    return f"""
+    <div style="
+        max-height: 600px;
+        overflow-y: auto;
+        padding-right: 0.5rem;">
+        <style>
+            :root[data-theme="light"] {{
+                --surface-color: #f8f9fa;
+                --surface-color-alt: #ffffff;
+                --text-color: #202124;
+                --text-muted: #666666;
+                --primary-text: #1a73e8;
+                --border-color: #e9ecef;
+                --shadow-color: rgba(0,0,0,0.1);
+                --message-bg-user: #E5F6FD;
+                --message-bg-assistant: #F7F7F8;
+                --message-bg-system: #FFF3E0;
+                --score-high: #1a73e8;
+                --score-med: #f4b400;
+                --score-low: #ea4335;
+            }}
+            :root[data-theme="dark"] {{
+                --surface-color: #1e1e1e;
+                --surface-color-alt: #2d2d2d;
+                --text-color: #ffffff;
+                --text-muted: #a0a0a0;
+                --primary-text: #60a5fa;
+                --border-color: #404040;
+                --shadow-color: rgba(0,0,0,0.3);
+                --message-bg-user: #2d3748;
+                --message-bg-assistant: #1a1a1a;
+                --message-bg-system: #2c2516;
+                --response-bg: #2a2f3a;
+                --score-high: #60a5fa;
+                --score-med: #fbbf24;
+                --score-low: #ef4444;
+            }}
+        </style>
+        {tool_html}
+    </div>
+    """
 def format_parameters(parameters):
     if not parameters:
+        return '<div style="color: var(--text-muted);">No parameters</div>'
     params_html = ""
     for name, desc in parameters.items():
         params_html += f"""
+        <div style="margin: 0.75rem 0;">
+            <div style="
+                font-weight: 500;
+                color: var(--primary-text);
+                margin-bottom: 0.25rem;">
+                {name}
+            </div>
+            <div style="
+                color: var(--text-color);
+                line-height: 1.4;
+                font-size: 0.95rem;">
+                {desc}
+            </div>
         </div>
         """
     return params_html
 def format_metrics(score, rationale, explanation):
+    """Format metrics display with improved dark theme support."""
+    score_color = (
+        "var(--score-high)"
+        if score >= 0.7
+        else "var(--score-med)" if score >= 0.4 else "var(--score-low)"
+    )
     return f"""
+    <div style="
+        padding: 1.5rem;
+        background-color: var(--surface-color);
+        border-radius: 8px;
+        border: 1px solid var(--border-color);
+        box-shadow: 0 2px 4px var(--shadow-color);">
+        <div style="margin-bottom: 1.5rem;">
+            <h3 style="
+                color: var(--text-color);
+                font-size: 1.1rem;
+                margin-bottom: 0.5rem;
+                font-weight: 600;">TSQ Score</h3>
+            <div style="
+                font-size: 2rem;
+                font-weight: 600;
+                color: {score_color};">
+                {score:.2f}
+            </div>
         </div>
+        <div style="margin-bottom: 1.5rem;">
+            <h3 style="
+                color: var(--text-color);
+                font-size: 1.1rem;
+                margin-bottom: 0.5rem;
+                font-weight: 600;">Rationale</h3>
+            <div style="
+                color: var(--text-color);
+                line-height: 1.5;">
+                {rationale}
+            </div>
         </div>
+        <div>
+            <h3 style="
+                color: var(--text-color);
+                font-size: 1.1rem;
+                margin-bottom: 0.5rem;
+                font-weight: 600;">Explanation</h3>
+            <div style="
+                color: var(--text-color);
+                line-height: 1.5;">
+                {explanation}
+            </div>
         </div>
     </div>
     """
 def update_chat_display(df, index):
+    """Update the chat visualization with improved dark theme support."""
     if df is None or df.empty or index >= len(df):
         return (
+            '<div style="padding: 1rem; color: var(--text-muted);">No data available</div>',
+            '<div style="padding: 1rem; color: var(--text-muted);">No metrics available</div>',
+            '<div style="padding: 1rem; color: var(--text-muted);">No tool information available</div>',
         )
     row = df.iloc[index]
     messages = json.loads(row["conversation"])
+    response = row["response"]
+    formatted_response = format_response(response)
+    # Create list of all messages including the response
+    all_messages = [
+        format_chat_message(msg["role"], msg["content"]) for msg in messages
+    ]
+    all_messages.append(
+        format_chat_message("Assistant", formatted_response, is_response=True)
+    )
     chat_html = f"""
+    <div style="
+        background-color: var(--surface-color);
+        border-radius: 8px;
+        border: 1px solid var(--border-color);
+        box-shadow: 0 2px 4px var(--shadow-color);
+        padding: 1.5rem;">
+        {"".join(all_messages)}
     </div>
     """
     metrics_html = format_metrics(row["score"], row["rationale"], row["explanation"])
     tool_html = format_tool_info(row["tools_langchain"])
     return chat_html, metrics_html, tool_html
+def filter_and_update_display(model, dataset, min_score, max_score, current_index):
     try:
         df_chat = get_chat_and_score_df(model, dataset)
+        df_chat = df_chat[
+            (df_chat["score"] >= min_score) & (df_chat["score"] <= max_score)
+        ]
         if df_chat.empty:
             return (
+                '<div style="padding: 1rem; color: var(--text-muted);">No data available for selected filters</div>',
+                '<div style="padding: 1rem; color: var(--text-muted);">No metrics available</div>',
+                '<div style="padding: 1rem; color: var(--text-muted);">No tool information available</div>',
                 "0/0",
             )
         max_index = len(df_chat) - 1
         current_index = min(current_index, max_index)
         chat_html, metrics_html, tool_html = update_chat_display(df_chat, current_index)
+        index_display = f'<div style="font-weight: 500; color: var(--primary-text);">{current_index + 1}/{len(df_chat)}</div>'
+        return chat_html, metrics_html, tool_html, index_display
     except Exception as e:
+        error_html = f"""
+        <div style="
+            padding: 1rem;
+            color: var(--score-low);
+            background-color: var(--surface-color);
+            border: 1px solid var(--score-low);
+            border-radius: 4px;">
+            Error: {str(e)}
+        </div>
+        """
         return (
+            error_html,
+            '<div style="padding: 1rem; color: var(--text-muted);">No metrics available</div>',
+            '<div style="padding: 1rem; color: var(--text-muted);">No tool information available</div>',
             "0/0",
         )

data_loader.py CHANGED Viewed

@@ -1,11 +1,25 @@
 import pandas as pd
-from glob import glob
-import numpy as np
-from pathlib import Path
-DATASETS = [Path(file).stem for file in glob("datasets/*.parquet")]
-SCORES = [round(x, 2) for x in np.arange(0, 1.1, 0.1).tolist()]
 def load_data():
     """Load and preprocess the data."""

 import pandas as pd
+DATASETS = [
+    "BFCL_v3_irrelevance",
+    "BFCL_v3_multi_turn_base_multi_func_call",
+    "BFCL_v3_multi_turn_base_single_func_call",
+    "BFCL_v3_multi_turn_composite",
+    "BFCL_v3_multi_turn_long_context",
+    "BFCL_v3_multi_turn_miss_func",
+    "BFCL_v3_multi_turn_miss_param",
+    "tau_long_context",
+    "toolace_single_func_call_1",
+    "toolace_single_func_call_2",
+    "xlam_multiple_tool_multiple_call",
+    "xlam_multiple_tool_single_call",
+    "xlam_single_tool_multiple_call",
+    "xlam_single_tool_single_call",
+    "xlam_tool_miss",
+]
+SCORES = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
 def load_data():
     """Load and preprocess the data."""

get_exp_data.ipynb ADDED Viewed

	@@ -0,0 +1,167 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from functools import lru_cache\n",
+    "from concurrent.futures import ThreadPoolExecutor\n",
+    "import promptquality as pq\n",
+    "from dotenv import load_dotenv\n",
+    "from data_loader import DATASETS, load_data\n",
+    "from tqdm.auto import tqdm\n",
+    "\n",
+    "load_dotenv()\n",
+    "pq.login(\"https://console.demo.rungalileo.io\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "project_name = \"agent-lb-v1\"\n",
+    "PROJECT_ID = pq.get_project_from_name(project_name).id\n",
+    "\n",
+    "\n",
+    "@lru_cache(maxsize=1000)\n",
+    "def get_output_df(model, dataset):\n",
+    "    print(f\"Getting metrics for {model} {project_name} for dataset {dataset}\")\n",
+    "    run_name = f\"{model} {dataset}\"\n",
+    "    run_id = pq.get_run_from_name(run_name, PROJECT_ID).id\n",
+    "    rows = pq.get_rows(\n",
+    "        project_id=PROJECT_ID,\n",
+    "        run_id=run_id,\n",
+    "        task_type=None,\n",
+    "        config=None,\n",
+    "        starting_token=0,\n",
+    "        limit=1000,\n",
+    "    )\n",
+    "\n",
+    "    rationales = [d.metrics.tool_selection_quality_rationale for d in rows]\n",
+    "\n",
+    "    scores = [\n",
+    "        round(d.metrics.tool_selection_quality, 2)\n",
+    "        for d, rationale in zip(rows, rationales)\n",
+    "        if rationale\n",
+    "    ]\n",
+    "    \n",
+    "    explanations = [\n",
+    "        d.metrics.tool_selection_quality_explanation\n",
+    "        for d, rationale in zip(rows, rationales)\n",
+    "        if rationale\n",
+    "    ]\n",
+    "    \n",
+    "    responses = [d.response for d, rationale in zip(rows, rationales)\n",
+    "        if rationale\n",
+    "    ]\n",
+    "    \n",
+    "    rationales = [r for r in rationales if r]\n",
+    "    mean_score = round(np.mean(scores), 2)\n",
+    "    \n",
+    "    data = {\n",
+    "        \"response\": responses,\n",
+    "        \"mean_score\": mean_score,\n",
+    "        \"score\": scores,\n",
+    "        \"rationale\": rationales,\n",
+    "        \"explanation\": explanations,\n",
+    "    }\n",
+    "    return pd.DataFrame(data)\n",
+    "\n",
+    "def save_output_df(df, model, dataset):\n",
+    "    os.makedirs(f\"output/{model}\", exist_ok=True)\n",
+    "    df.to_parquet(f\"output/{model}/{dataset}.parquet\")\n",
+    "\n",
+    "def get_updated_df(df, df_output):\n",
+    "    df = df.iloc[:len(df_output)].copy()\n",
+    "    \n",
+    "    df[\"response\"] = df_output[\"response\"].tolist()\n",
+    "    df[\"rationale\"] = df_output[\"rationale\"].tolist()\n",
+    "    df[\"explanation\"] = df_output[\"explanation\"].tolist()\n",
+    "    df[\"score\"] = df_output[\"score\"].tolist()\n",
+    "    cols = ['conversation', 'tools_langchain', 'n_turns',\n",
+    "            'len_query', 'n_tools', 'response', 'rationale', 'explanation', 'score']\n",
+    "    return df[cols]\n",
+    "\n",
+    "\n",
+    "def get_chat_and_score_df(model, dataset):\n",
+    "    df_output = pd.read_parquet(f\"output/{model}/{dataset}.parquet\")\n",
+    "    df = pd.read_parquet(f\"datasets/{dataset}.parquet\")\n",
+    "    df = get_updated_df(df, df_output)\n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def process_dataset(args):\n",
+    "    model, dataset = args\n",
+    "    if os.path.exists(f\"output/{model}/{dataset}.parquet\"):\n",
+    "        return None\n",
+    "    print(model, dataset)\n",
+    "    df_output = get_output_df(model, dataset)\n",
+    "    save_output_df(df_output, model, dataset)\n",
+    "    return f\"Completed: {model} - {dataset}\"\n",
+    "\n",
+    "def process_model_datasets(model, datasets, max_workers=5):\n",
+    "    with ThreadPoolExecutor(max_workers=max_workers) as executor:\n",
+    "        # Create arguments list for each dataset\n",
+    "        args_list = [(model, dataset) for dataset in datasets]\n",
+    "        \n",
+    "        # Process datasets in parallel with progress bar\n",
+    "        list(tqdm(\n",
+    "            executor.map(process_dataset, args_list),\n",
+    "            total=len(datasets),\n",
+    "            desc=f\"Datasets ({model})\",\n",
+    "            position=1,\n",
+    "            leave=False\n",
+    "        ))\n",
+    "\n",
+    "\n",
+    "models = [\"accounts/fireworks/models/qwen2p5-72b-instruct\", \"meta-llama/Llama-3.3-70B-Instruct-Turbo\", \"meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo\"]\n",
+    "# models = load_data()[\"Model\"]\n",
+    "\n",
+    "# Process each model sequentially, but datasets in parallel\n",
+    "for model in tqdm(models, desc=\"Models\", position=0):\n",
+    "    process_model_datasets(model, DATASETS)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "langgraph",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_irrelevance.parquet ADDED Viewed

Binary file (36.4 kB). View file

output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_base_multi_func_call.parquet ADDED Viewed

Binary file (25.4 kB). View file

output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_base_single_func_call.parquet ADDED Viewed

Binary file (22.9 kB). View file

output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_composite.parquet ADDED Viewed

Binary file (42.4 kB). View file

output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_long_context.parquet ADDED Viewed

Binary file (38 kB). View file

output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_miss_func.parquet ADDED Viewed

Binary file (41.6 kB). View file

output/Llama-3.3-70B-Instruct-Turbo/BFCL_v3_multi_turn_miss_param.parquet ADDED Viewed

Binary file (42.7 kB). View file

output/Llama-3.3-70B-Instruct-Turbo/tau_long_context.parquet ADDED Viewed

Binary file (47.1 kB). View file

output/Llama-3.3-70B-Instruct-Turbo/toolace_single_func_call_1.parquet ADDED Viewed

Binary file (13.1 kB). View file

output/Llama-3.3-70B-Instruct-Turbo/toolace_single_func_call_2.parquet ADDED Viewed

Binary file (11.5 kB). View file

output/Llama-3.3-70B-Instruct-Turbo/xlam_multiple_tool_multiple_call.parquet ADDED Viewed

Binary file (104 kB). View file

output/Llama-3.3-70B-Instruct-Turbo/xlam_multiple_tool_single_call.parquet ADDED Viewed

Binary file (39.3 kB). View file

output/Llama-3.3-70B-Instruct-Turbo/xlam_single_tool_multiple_call.parquet ADDED Viewed

Binary file (30.4 kB). View file

output/Llama-3.3-70B-Instruct-Turbo/xlam_single_tool_single_call.parquet ADDED Viewed

Binary file (43.8 kB). View file

output/Llama-3.3-70B-Instruct-Turbo/xlam_tool_miss.parquet ADDED Viewed

Binary file (49.4 kB). View file

output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_irrelevance.parquet ADDED Viewed

Binary file (41.5 kB). View file

output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_base_multi_func_call.parquet ADDED Viewed

Binary file (28.8 kB). View file

output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_base_single_func_call.parquet ADDED Viewed

Binary file (24.3 kB). View file

output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_composite.parquet ADDED Viewed

Binary file (59 kB). View file

output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_long_context.parquet ADDED Viewed

Binary file (45.7 kB). View file

output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_miss_func.parquet ADDED Viewed

Binary file (49.5 kB). View file

output/Meta-Llama-3.1-8B-Instruct-Turbo/BFCL_v3_multi_turn_miss_param.parquet ADDED Viewed

Binary file (45.8 kB). View file

output/Meta-Llama-3.1-8B-Instruct-Turbo/tau_long_context.parquet ADDED Viewed

Binary file (106 kB). View file

output/Meta-Llama-3.1-8B-Instruct-Turbo/toolace_single_func_call_1.parquet ADDED Viewed

Binary file (18.3 kB). View file

output/Meta-Llama-3.1-8B-Instruct-Turbo/toolace_single_func_call_2.parquet ADDED Viewed

Binary file (14.7 kB). View file

output/Meta-Llama-3.1-8B-Instruct-Turbo/xlam_multiple_tool_multiple_call.parquet ADDED Viewed

Binary file (103 kB). View file

output/Meta-Llama-3.1-8B-Instruct-Turbo/xlam_multiple_tool_single_call.parquet ADDED Viewed

Binary file (39.9 kB). View file

output/Meta-Llama-3.1-8B-Instruct-Turbo/xlam_single_tool_multiple_call.parquet ADDED Viewed

Binary file (30.6 kB). View file

output/Meta-Llama-3.1-8B-Instruct-Turbo/xlam_single_tool_single_call.parquet ADDED Viewed

Binary file (45 kB). View file

output/Meta-Llama-3.1-8B-Instruct-Turbo/xlam_tool_miss.parquet ADDED Viewed

Binary file (75.5 kB). View file

output/claude-3-5-haiku-20241022/BFCL_v3_irrelevance.parquet ADDED Viewed

Binary file (56.5 kB). View file

output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_base_multi_func_call.parquet ADDED Viewed

Binary file (25.8 kB). View file

output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_base_single_func_call.parquet ADDED Viewed

Binary file (24.7 kB). View file

output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_composite.parquet ADDED Viewed

Binary file (50.6 kB). View file

output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_long_context.parquet ADDED Viewed

Binary file (40.6 kB). View file

output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_miss_func.parquet ADDED Viewed

Binary file (49 kB). View file

output/claude-3-5-haiku-20241022/BFCL_v3_multi_turn_miss_param.parquet ADDED Viewed

Binary file (49.8 kB). View file

output/claude-3-5-haiku-20241022/tau_long_context.parquet ADDED Viewed

Binary file (42.4 kB). View file

output/claude-3-5-haiku-20241022/toolace_single_func_call_1.parquet ADDED Viewed

Binary file (19.9 kB). View file

output/claude-3-5-haiku-20241022/toolace_single_func_call_2.parquet ADDED Viewed

Binary file (13.8 kB). View file

output/claude-3-5-haiku-20241022/xlam_multiple_tool_multiple_call.parquet ADDED Viewed

Binary file (89.3 kB). View file

output/claude-3-5-haiku-20241022/xlam_multiple_tool_single_call.parquet ADDED Viewed

Binary file (40.8 kB). View file

output/claude-3-5-haiku-20241022/xlam_single_tool_multiple_call.parquet ADDED Viewed

Binary file (27.3 kB). View file

output/claude-3-5-haiku-20241022/xlam_single_tool_single_call.parquet ADDED Viewed

Binary file (49.3 kB). View file

output/claude-3-5-haiku-20241022/xlam_tool_miss.parquet ADDED Viewed

Binary file (56.6 kB). View file

output/claude-3-5-sonnet-20241022/BFCL_v3_irrelevance.parquet ADDED Viewed

Binary file (47.4 kB). View file