In [None]:
import os
import pandas as pd
import numpy as np
from functools import lru_cache
from concurrent.futures import ThreadPoolExecutor
import promptquality as pq
from dotenv import load_dotenv
from data_loader import DATASETS, load_data
from tqdm.auto import tqdm

# Load variables from a local .env file into the process environment before
# logging in (presumably this is where the credentials for pq.login come
# from — TODO confirm).
load_dotenv()
# Log promptquality in against the console at this URL; every pq.* call
# further down in the notebook runs after this login.
pq.login("https://console.demo.rungalileo.io")

In [2]:
# Name of the promptquality project whose runs this notebook reads.
project_name = "agent-lb-v1"
# Resolve the project's id once, when the cell runs; get_output_df uses this
# module-level value for its run lookup and row fetch.
PROJECT_ID = pq.get_project_from_name(project_name).id


@lru_cache(maxsize=1000)
def get_output_df(model, dataset):
    """Fetch one run's rows and tabulate their tool-selection-quality results.

    The run is looked up by the name ``"{model} {dataset}"`` inside
    ``PROJECT_ID``; rows are requested with ``starting_token=0`` and
    ``limit=1000``. Rows whose ``tool_selection_quality_rationale`` is falsy
    are dropped before the table is built.

    Args:
        model: Model identifier; first part of the run name.
        dataset: Dataset identifier; second part of the run name.

    Returns:
        pandas.DataFrame with one row per kept record and the columns
        ``response``, ``mean_score``, ``score``, ``rationale`` and
        ``explanation`` (in that order). ``score`` is rounded to two decimals;
        ``mean_score`` is the rounded mean of those scores, passed as a single
        scalar for the whole column.

    Note:
        The function is wrapped in ``lru_cache``, so a repeated call with the
        same arguments returns the very same DataFrame object.
    """
    print(f"Getting metrics for {model} {project_name} for dataset {dataset}")

    run = pq.get_run_from_name(f"{model} {dataset}", PROJECT_ID)
    fetched = pq.get_rows(
        project_id=PROJECT_ID,
        run_id=run.id,
        task_type=None,
        config=None,
        starting_token=0,
        limit=1000,
    )

    # Read each row's rationale once, then keep only the rows that have one.
    paired = [(row, row.metrics.tool_selection_quality_rationale) for row in fetched]
    kept = [(row, why) for row, why in paired if why]

    score_col = [round(row.metrics.tool_selection_quality, 2) for row, _ in kept]
    explanation_col = [
        row.metrics.tool_selection_quality_explanation for row, _ in kept
    ]
    response_col = [row.response for row, _ in kept]
    rationale_col = [why for _, why in kept]

    # Mean is taken over the already-rounded per-row scores.
    average = round(np.mean(score_col), 2)

    return pd.DataFrame(
        {
            "response": response_col,
            "mean_score": average,
            "score": score_col,
            "rationale": rationale_col,
            "explanation": explanation_col,
        }
    )

def save_output_df(df, model, dataset):
    """Write ``df`` to ``output/{model}/{dataset}.parquet`` without leaving partial files.

    ``process_dataset`` treats an existing output file as finished work, so a
    write that dies halfway must never leave a file at the final path. The
    frame is therefore written to a scratch file next to the target and moved
    into place with ``os.replace`` only after the write succeeded.

    Args:
        df: Object exposing ``to_parquet(path)`` (a pandas DataFrame).
        model: Model identifier; may contain ``/`` and then maps to nested
            directories.
        dataset: Dataset identifier; may also contain ``/``.

    Raises:
        Whatever ``df.to_parquet`` or the filesystem calls raise; the scratch
        file is removed before the exception propagates.
    """
    target = f"output/{model}/{dataset}.parquet"
    # Create the directory the file actually lands in, not just output/{model}:
    # a dataset name with a path separator needs its own parent directory.
    os.makedirs(os.path.dirname(target), exist_ok=True)

    scratch = f"{target}.tmp"
    try:
        df.to_parquet(scratch)
        # Swap into place in one step so readers never observe a half-written file.
        os.replace(scratch, target)
    except BaseException:
        # Clean up the scratch file (if it was created at all), then re-raise.
        try:
            os.remove(scratch)
        except FileNotFoundError:
            pass
        raise

def get_updated_df(df, df_output):
    """Attach the output columns of ``df_output`` to the leading rows of ``df``.

    Only the first ``len(df_output)`` rows of ``df`` are used. The output
    columns are copied over as plain lists, so rows are paired by position and
    the index labels of ``df_output`` play no role.
    NOTE(review): this assumes the i-th row of ``df_output`` belongs to the
    i-th row of ``df`` — confirm against how ``df_output`` was produced.

    Args:
        df: Dataset frame; must contain ``conversation``, ``tools_langchain``,
            ``n_turns``, ``len_query`` and ``n_tools``.
        df_output: Frame with ``response``, ``rationale``, ``explanation`` and
            ``score`` columns.

    Returns:
        A new DataFrame (``df`` itself is not modified) restricted to the five
        dataset columns followed by the four output columns.
    """
    n_rows = len(df_output)
    merged = df.iloc[:n_rows].copy()

    for column in ("response", "rationale", "explanation", "score"):
        merged[column] = df_output[column].tolist()

    wanted = [
        "conversation",
        "tools_langchain",
        "n_turns",
        "len_query",
        "n_tools",
        "response",
        "rationale",
        "explanation",
        "score",
    ]
    return merged[wanted]


def get_chat_and_score_df(model, dataset):
    """Load a saved output file and its dataset and combine them.

    Reads ``output/{model}/{dataset}.parquet`` and
    ``datasets/{dataset}.parquet`` and passes both to ``get_updated_df``.

    Args:
        model: Model identifier used in the output path.
        dataset: Dataset identifier used in both paths.

    Returns:
        The DataFrame returned by ``get_updated_df``.
    """
    scored = pd.read_parquet(f"output/{model}/{dataset}.parquet")
    prompts = pd.read_parquet(f"datasets/{dataset}.parquet")
    return get_updated_df(prompts, scored)

In [None]:
def process_dataset(args):
    """Fetch and save the output table for one (model, dataset) pair.

    Takes a single tuple so it can be handed directly to ``executor.map``.

    Args:
        args: ``(model, dataset)`` tuple.

    Returns:
        ``None`` when ``output/{model}/{dataset}.parquet`` already exists and
        the pair is skipped; otherwise a ``"Completed: ..."`` status string.
    """
    model, dataset = args

    already_saved = os.path.exists(f"output/{model}/{dataset}.parquet")
    if not already_saved:
        print(model, dataset)
        save_output_df(get_output_df(model, dataset), model, dataset)
        return f"Completed: {model} - {dataset}"

    # Output is already on disk: nothing to do for this pair.
    return None

def process_model_datasets(model, datasets, max_workers=5):
    """Run ``process_dataset`` for every dataset of one model on a thread pool.

    Args:
        model: Model identifier paired with each dataset.
        datasets: Any iterable of dataset identifiers. It is materialised once
            up front, so generators and other unsized iterables work too.
        max_workers: Size of the thread pool.

    Returns:
        None. The results of ``process_dataset`` are consumed only to drive the
        progress bar; an exception raised by a worker propagates from here when
        its result is reached.
    """
    # One (model, dataset) argument tuple per dataset. Built before the pool
    # starts so the progress total comes from this list rather than from
    # len(datasets), which would require a sized container.
    jobs = [(model, dataset) for dataset in datasets]

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Process datasets in parallel with a nested progress bar
        progress = tqdm(
            executor.map(process_dataset, jobs),
            total=len(jobs),
            desc=f"Datasets ({model})",
            position=1,
            leave=False,
        )
        # Exhaust the iterator: this advances the bar and surfaces worker errors.
        for _ in progress:
            pass


# Models whose runs are exported by the loop below.
models = [
    "accounts/fireworks/models/qwen2p5-72b-instruct",
    "meta-llama/Llama-3.3-70B-Instruct-Turbo",
    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
]
# Alternative: take the model list from the loaded data instead.
# models = load_data()["Model"]

# Models run one after another; the datasets of each model run in parallel.
for model in tqdm(models, desc="Models", position=0):
    process_model_datasets(model, DATASETS)
