{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "import pandas as pd\n", "import numpy as np\n", "from functools import lru_cache\n", "from concurrent.futures import ThreadPoolExecutor\n", "import promptquality as pq\n", "from dotenv import load_dotenv\n", "from data_loader import DATASETS, load_data\n", "from tqdm.auto import tqdm\n", "\n", "load_dotenv()\n", "pq.login(\"https://console.demo.rungalileo.io\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "project_name = \"agent-lb-v1\"\n", "PROJECT_ID = pq.get_project_from_name(project_name).id\n", "\n", "\n", "@lru_cache(maxsize=1000)\n", "def get_output_df(model, dataset):\n", " print(f\"Getting metrics for {model} {project_name} for dataset {dataset}\")\n", " run_name = f\"{model} {dataset}\"\n", " run_id = pq.get_run_from_name(run_name, PROJECT_ID).id\n", " rows = pq.get_rows(\n", " project_id=PROJECT_ID,\n", " run_id=run_id,\n", " task_type=None,\n", " config=None,\n", " starting_token=0,\n", " limit=1000,\n", " )\n", "\n", " rationales = [d.metrics.tool_selection_quality_rationale for d in rows]\n", "\n", " scores = [\n", " round(d.metrics.tool_selection_quality, 2)\n", " for d, rationale in zip(rows, rationales)\n", " if rationale\n", " ]\n", " \n", " explanations = [\n", " d.metrics.tool_selection_quality_explanation\n", " for d, rationale in zip(rows, rationales)\n", " if rationale\n", " ]\n", " \n", " responses = [d.response for d, rationale in zip(rows, rationales)\n", " if rationale\n", " ]\n", " \n", " rationales = [r for r in rationales if r]\n", " mean_score = round(np.mean(scores), 2)\n", " \n", " data = {\n", " \"response\": responses,\n", " \"mean_score\": mean_score,\n", " \"score\": scores,\n", " \"rationale\": rationales,\n", " \"explanation\": explanations,\n", " }\n", " return pd.DataFrame(data)\n", "\n", "def save_output_df(df, model, dataset):\n", " os.makedirs(f\"output/{model}\", exist_ok=True)\n", " df.to_parquet(f\"output/{model}/{dataset}.parquet\")\n", "\n", "def get_updated_df(df, df_output):\n", " df = df.iloc[:len(df_output)].copy()\n", " \n", " df[\"response\"] = df_output[\"response\"].tolist()\n", " df[\"rationale\"] = df_output[\"rationale\"].tolist()\n", " df[\"explanation\"] = df_output[\"explanation\"].tolist()\n", " df[\"score\"] = df_output[\"score\"].tolist()\n", " cols = ['conversation', 'tools_langchain', 'n_turns',\n", " 'len_query', 'n_tools', 'response', 'rationale', 'explanation', 'score']\n", " return df[cols]\n", "\n", "\n", "def get_chat_and_score_df(model, dataset):\n", " df_output = pd.read_parquet(f\"output/{model}/{dataset}.parquet\")\n", " df = pd.read_parquet(f\"datasets/{dataset}.parquet\")\n", " df = get_updated_df(df, df_output)\n", " return df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def process_dataset(args):\n", " model, dataset = args\n", " if os.path.exists(f\"output/{model}/{dataset}.parquet\"):\n", " return None\n", " print(model, dataset)\n", " df_output = get_output_df(model, dataset)\n", " save_output_df(df_output, model, dataset)\n", " return f\"Completed: {model} - {dataset}\"\n", "\n", "def process_model_datasets(model, datasets, max_workers=5):\n", " with ThreadPoolExecutor(max_workers=max_workers) as executor:\n", " # Create arguments list for each dataset\n", " args_list = [(model, dataset) for dataset in datasets]\n", " \n", " # Process datasets in parallel with progress bar\n", " list(tqdm(\n", " executor.map(process_dataset, args_list),\n", " total=len(datasets),\n", " desc=f\"Datasets ({model})\",\n", " position=1,\n", " leave=False\n", " ))\n", "\n", "\n", "models = [\"accounts/fireworks/models/qwen2p5-72b-instruct\", \"meta-llama/Llama-3.3-70B-Instruct-Turbo\", \"meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo\"]\n", "# models = load_data()[\"Model\"]\n", "\n", "# Process each model sequentially, but datasets in parallel\n", "for model in tqdm(models, desc=\"Models\", position=0):\n", " process_model_datasets(model, DATASETS)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "langgraph", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.6" } }, "nbformat": 4, "nbformat_minor": 2 }