{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import pandas as pd\n",
"import numpy as np\n",
"from functools import lru_cache\n",
"from concurrent.futures import ThreadPoolExecutor\n",
"import promptquality as pq\n",
"from dotenv import load_dotenv\n",
"from data_loader import DATASETS, load_data\n",
"from tqdm.auto import tqdm\n",
"\n",
"load_dotenv()\n",
"pq.login(\"https://console.demo.rungalileo.io\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"project_name = \"agent-lb-v1\"\n",
"PROJECT_ID = pq.get_project_from_name(project_name).id\n",
"\n",
"\n",
"@lru_cache(maxsize=1000)\n",
"def get_output_df(model, dataset):\n",
" print(f\"Getting metrics for {model} {project_name} for dataset {dataset}\")\n",
" run_name = f\"{model} {dataset}\"\n",
" run_id = pq.get_run_from_name(run_name, PROJECT_ID).id\n",
" rows = pq.get_rows(\n",
" project_id=PROJECT_ID,\n",
" run_id=run_id,\n",
" task_type=None,\n",
" config=None,\n",
" starting_token=0,\n",
" limit=1000,\n",
" )\n",
"\n",
" rationales = [d.metrics.tool_selection_quality_rationale for d in rows]\n",
"\n",
" scores = [\n",
" round(d.metrics.tool_selection_quality, 2)\n",
" for d, rationale in zip(rows, rationales)\n",
" if rationale\n",
" ]\n",
" \n",
" explanations = [\n",
" d.metrics.tool_selection_quality_explanation\n",
" for d, rationale in zip(rows, rationales)\n",
" if rationale\n",
" ]\n",
" \n",
" responses = [d.response for d, rationale in zip(rows, rationales)\n",
" if rationale\n",
" ]\n",
" \n",
" rationales = [r for r in rationales if r]\n",
" mean_score = round(np.mean(scores), 2)\n",
" \n",
" data = {\n",
" \"response\": responses,\n",
" \"mean_score\": mean_score,\n",
" \"score\": scores,\n",
" \"rationale\": rationales,\n",
" \"explanation\": explanations,\n",
" }\n",
" return pd.DataFrame(data)\n",
"\n",
"def save_output_df(df, model, dataset):\n",
" os.makedirs(f\"output/{model}\", exist_ok=True)\n",
" df.to_parquet(f\"output/{model}/{dataset}.parquet\")\n",
"\n",
"def get_updated_df(df, df_output):\n",
" df = df.iloc[:len(df_output)].copy()\n",
" \n",
" df[\"response\"] = df_output[\"response\"].tolist()\n",
" df[\"rationale\"] = df_output[\"rationale\"].tolist()\n",
" df[\"explanation\"] = df_output[\"explanation\"].tolist()\n",
" df[\"score\"] = df_output[\"score\"].tolist()\n",
" cols = ['conversation', 'tools_langchain', 'n_turns',\n",
" 'len_query', 'n_tools', 'response', 'rationale', 'explanation', 'score']\n",
" return df[cols]\n",
"\n",
"\n",
"def get_chat_and_score_df(model, dataset):\n",
" df_output = pd.read_parquet(f\"output/{model}/{dataset}.parquet\")\n",
" df = pd.read_parquet(f\"datasets/{dataset}.parquet\")\n",
" df = get_updated_df(df, df_output)\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def process_dataset(args):\n",
" model, dataset = args\n",
" if os.path.exists(f\"output/{model}/{dataset}.parquet\"):\n",
" return None\n",
" print(model, dataset)\n",
" df_output = get_output_df(model, dataset)\n",
" save_output_df(df_output, model, dataset)\n",
" return f\"Completed: {model} - {dataset}\"\n",
"\n",
"def process_model_datasets(model, datasets, max_workers=5):\n",
" with ThreadPoolExecutor(max_workers=max_workers) as executor:\n",
" # Create arguments list for each dataset\n",
" args_list = [(model, dataset) for dataset in datasets]\n",
" \n",
" # Process datasets in parallel with progress bar\n",
" list(tqdm(\n",
" executor.map(process_dataset, args_list),\n",
" total=len(datasets),\n",
" desc=f\"Datasets ({model})\",\n",
" position=1,\n",
" leave=False\n",
" ))\n",
"\n",
"\n",
"models = [\"accounts/fireworks/models/qwen2p5-72b-instruct\", \"meta-llama/Llama-3.3-70B-Instruct-Turbo\", \"meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo\"]\n",
"# models = load_data()[\"Model\"]\n",
"\n",
"# Process each model sequentially, but datasets in parallel\n",
"for model in tqdm(models, desc=\"Models\", position=0):\n",
" process_model_datasets(model, DATASETS)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "langgraph",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}