{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "This Notebook is to test the various OpenAI models, prompts, and number of few-shot examples to see how they perform on the same task." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "!pip install wandb --upgrade openai datasets -qU" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import os\n", "from dotenv import load_dotenv\n", "load_dotenv()\n", "\n", "import openai\n", "\n", "# set OPENAI_API_KEY environment variable from .env file\n", "openai.api_key = os.getenv(\"OPENAI_API_KEY\")\n", "\n", "# import OpenAIChatCompletions class from openai_chat_completion.py file and compare_completion_and_prediction function from util.py file\n", "from openai_chat_completion import OpenAIChatCompletions\n", "from util import compare_completion_and_prediction" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Models:\n", "- gpt-3.5-turbo\n", "- gpt-4\n", "\n", "Prompts:\n", "- gpt4-system-message.txt\n", "\n", "Few-shot examples:\n", "> 0 ... 10" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "wandb setup:\n", "- entity: kaleidoscope-data\n", "- project: cookies_llm_experimental_eval\n", "- tags: gpt-3.5-turbo, gpt-4, gpt4-system-message, few-shot" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)\n", "\u001b[34m\u001b[1mwandb\u001b[0m: You can find your API key in your browser here: https://wandb.ai/authorize\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[32m\u001b[41mERROR\u001b[0m API key must be 40 characters long, yours was 48\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)\n", "\u001b[34m\u001b[1mwandb\u001b[0m: You can find your API key in your browser here: https://wandb.ai/authorize\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:\u001b[34m\u001b[1mwandb\u001b[0m: Appending key for api.wandb.ai to your netrc file: /home/cmagganas/.netrc\n" ] }, { "data": { "text/html": [ "Tracking run with wandb version 0.15.4" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Run data is saved locally in /home/cmagganas/kaleidoscope/llm_data_cleaner/app/wandb/run-20230626_114056-rbtf91s6" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Syncing run rose-puddle-7 to Weights & Biases (docs)
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ " View project at https://wandb.ai/kaleidoscope-data/cookies_llm_experimental_eval" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ " View run at https://wandb.ai/kaleidoscope-data/cookies_llm_experimental_eval/runs/rbtf91s6" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from wandb.integration.openai import autolog\n", "\n", "autolog({\"project\":\"cookies_llm_experimental_eval\",\n", " \"entity\": \"kaleidoscope-data\",\n", " \"group\": \"cookies\",\n", " \"job_type\": \"eval\"})" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# create an empty dataframe to store predictions\n", "import pandas as pd\n", "predictions_df = pd.DataFrame(columns=['model', 'system_message', 'n_shot', 'prompt', 'completion', 'prediction'])\n", "\n", "models_to_test = [\"gpt-4\", \"gpt-3.5-turbo\"]\n", "sys_mes_to_test = [\"../prompts/gpt4-system-message.txt\", \"../prompts/gpt4-system-message2.txt\"] # names are arbitrary, same prompts but with \"####\" in system message 2\n", "n_shots_to_test = [None, 1, 2, 3, 5]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# if rerunning the below cell is required, set the following to True\n", "rerun = False\n", "if rerun:\n", " predictions_df = pd.read_csv('../data/cookies_llm_eval_predictions.csv')" ] }, { "cell_type": "code", "execution_count": 178, "metadata": {}, "outputs": [], "source": [ "# get predictions for all combinations of models, prompts, and n_shot values\n", "# save predictions to dataframe and then to csv in data folder after each iteration\n", "\n", "# loop through models_to_test\n", "for model in models_to_test:\n", " # loop through prompts_to_test\n", " for system_message in sys_mes_to_test:\n", " # instantiate OpenAIChatCompletions class\n", " chat = OpenAIChatCompletions(model=model, system_message=system_message)\n", " # loop through n_shots_to_test\n", " for n_shot in n_shots_to_test:\n", " sys_mes_var = 1 if system_message == \"../prompts/gpt4-system-message.txt\" else 2\n", " n_shot_var = 0 if n_shot == None else n_shot\n", " # check if predictions for this model, system_message, and n_shot value have already been made\n", " if predictions_df[(predictions_df['model'] == model) & (predictions_df['system_message'] == sys_mes_var) & (predictions_df['n_shot'] == n_shot_var)].shape[0] == 0:\n", " prompts, completions, predictions = chat.predict_jsonl(n_shot=n_shot)\n", " else:\n", " # skip if predictions for this model, system_message, and n_shot value have already been made\n", " continue\n", " # save predictions to dataframe\n", " df_to_append = pd.DataFrame({'model': model, 'system_message': sys_mes_var, 'n_shot': n_shot_var, 'prompt': prompts, 'completion': completions, 'prediction': predictions})\n", " df_right = df_to_append['prediction'].apply(pd.Series)\n", " df_right['prediction'] = df_right['choices'].apply(lambda x: x[0]['message']['content']).drop(columns=['choices'])\n", " df_to_append = pd.concat([df_to_append[['model', 'system_message', 'n_shot', 'prompt', 'completion']], df_right], axis=1)\n", " df_to_append.columns = ['model', 'system_message', 'n_shot', 'prompt', 'completion', 'id', 'object', 'created', 'openai_model', 'choices', 'usage', 'prediction']\n", " # save predictions to dataframe\n", " predictions_df = pd.concat([predictions_df, 
df_to_append], ignore_index=True)\n", " # delete duplicates from dataframe\n", " predictions_df = predictions_df[~predictions_df.duplicated(subset=['model', 'system_message', 'n_shot', 'prompt'])]\n", " predictions_df.to_csv('../data/cookies_llm_eval_predictions.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 179, "metadata": {}, "outputs": [], "source": [ "predictions_df = predictions_df[~predictions_df.duplicated(subset=['model', 'system_message', 'n_shot', 'prompt'])]" ] }, { "cell_type": "code", "execution_count": 180, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(400, 12)" ] }, "execution_count": 180, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predictions_df.shape" ] }, { "cell_type": "code", "execution_count": 143, "metadata": {}, "outputs": [], "source": [ "# import numpy as np\n", "\n", "# ids = predictions_df['id'].isna()\n", "# # apply pd.Series to predictions column for rows where id is not null and change system_message {0,1} to {1,2}\n", "# new_df_right = predictions_df.loc[ids, 'prediction'].apply(pd.Series)\n", "# new_df_right['prediction'] = new_df_right['choices'].apply(lambda x: x[0]['message']['content']).drop(columns=['choices'])\n", "# new_df_left = predictions_df.loc[ids, ['model', 'system_message', 'n_shot', 'prompt', 'completion']].replace({0:1, 1:2})\n", "# new_df = pd.concat([new_df_left, new_df_right], axis=1)\n", "\n", "# predictions_df.columns = ['model', 'system_message', 'n_shot', 'prompt', 'completion', 'id', 'object', 'created', 'openai_model', 'choices', 'usage', 'prediction']\n", "# new_df.columns = ['model', 'system_message', 'n_shot', 'prompt', 'completion', 'id', 'object', 'created', 'openai_model', 'choices', 'usage', 'prediction']\n", "# predictions_df.loc[ids] = new_df" ] }, { "cell_type": "code", "execution_count": 155, "metadata": {}, "outputs": [], "source": [ "# for col in ['model','system_message','n_shot']:\n", "# print(predictions_df[col].value_counts())" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [], "source": [ "# import numpy as np\n", "\n", "# # create a copy of predictions_df to manipulate\n", "# new_predictions_df = predictions_df\n", "\n", "# # replace names with 1 or 2\n", "# def replace_sys_mes_name(x):\n", "# if x == \"../prompts/gpt4-system-message.txt\":\n", "# return \"1\"\n", "# elif x == \"../prompts/gpt4-system-message2.txt\":\n", "# return \"2\"\n", "# else:\n", "# return x\n", "# new_predictions_df['system_message'] = new_predictions_df['system_message'].apply(lambda x: replace_sys_mes_name(x))\n", "# # replace None with 0\n", "# new_predictions_df['n_shot'] = new_predictions_df['n_shot'].apply(lambda x: 0 if x == None or np.nan else x)\n", "\n", "# # break up prediction column into sub columns by each of json keys\n", "# new_predictions_df = pd.concat([new_predictions_df, new_predictions_df['prediction'].apply(pd.Series)], axis=1)" ] }, { "cell_type": "code", "execution_count": 168, "metadata": {}, "outputs": [], "source": [ "# predictions_df.drop(columns=['num_correct'], inplace=True)" ] }, { "cell_type": "code", "execution_count": 181, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
modelsystem_messagen_shotpromptcompletionidobjectcreatedopenai_modelchoicesusageprediction
0gpt-410co-2MFE5QVF,Chill Medicated - Watermelon - Syr...Chill Medicated,Edible,Beverage,nan,nanchatcmpl-7VlTkjAqXNRWfltMPpr5v37uBJIsgchat.completion1.687805e+09gpt-4-0314[<OpenAIObject at 0x7fcf7fde94e0> JSON: {\\n \"...{\\n \"prompt_tokens\": 54,\\n \"completion_token...Hello! It looks like you mentioned a product: ...
1gpt-410bl-111630024545,Feelz - Space Cowboy 3.5g,nan,...Feelz,Flower,Bud,Space Cowboy,3.5chatcmpl-7VlTtGF3RGsngfKB1BXufxoTixX2vchat.completion1.687805e+09gpt-4-0314[<OpenAIObject at 0x7fcf7f49d2b0> JSON: {\\n \"...{\\n \"prompt_tokens\": 51,\\n \"completion_token...Hello! It seems like you are referring to a pr...
2gpt-410fl-8voAjt83sD,Champelli | Xclusivo 3.5g | Eigh...Champelli,Flower,Bud,Xclusivo,3.5chatcmpl-7VlU80b0m00VaiGymtj9dbqOggTgRchat.completion1.687805e+09gpt-4-0314[<OpenAIObject at 0x7fcf7e306890> JSON: {\\n \"...{\\n \"prompt_tokens\": 71,\\n \"completion_token...Hello! It seems like you're interested in the ...
3gpt-410bl-073133213364,CAM - Mellowz #7 7g,nan,FLOWER...CAM,Flower,Bud,Mellowz #7,7chatcmpl-7VlUHqbsG2kpFHDxAWfsryh6pHmC9chat.completion1.687805e+09gpt-4-0314[<OpenAIObject at 0x7fcf7e33d940> JSON: {\\n \"...{\\n \"prompt_tokens\": 49,\\n \"completion_token...It seems like you are looking for information ...
4gpt-410fl-fwJQL2AWnS,Backpack Boyz | Bubblegum Gelato...Backpack Boyz,Edible,CBD Tincture/Caps/etc,nan...chatcmpl-7VlUYvcad2wahIMHavhDEkYrgvjpwchat.completion1.687805e+09gpt-4-0314[<OpenAIObject at 0x7fcf7e306980> JSON: {\\n \"...{\\n \"prompt_tokens\": 59,\\n \"completion_token...Hello! It seems like you are looking for infor...
.......................................
395gpt-3.5-turbo21co-76GP441T,Minntz - Emerald Cut - Indoor - Jo...Minntz,Preroll,Joint,Emerald Cut,1chatcmpl-7VrjRMvs2l8EJd4PVecpSRPCvV9Hkchat.completion1.687829e+09gpt-3.5-turbo-0301[{'index': 0, 'message': {'role': 'assistant',...{'prompt_tokens': 125, 'completion_tokens': 23...Minntz,Joint,Indoor,Emerald Cut,1g,co-76GP441T.
396gpt-3.5-turbo21co-5RAWYHYQ,The Growers Circle - Double Down -...The Growers Circle,Flower,Bud,Double Down,3.5chatcmpl-7VrjT3wfVoLtq3G6xksfVtLz4FloJchat.completion1.687829e+09gpt-3.5-turbo-0301[{'index': 0, 'message': {'role': 'assistant',...{'prompt_tokens': 123, 'completion_tokens': 22...The Growers Circle,Double Down,Indoor,3.5g,5RA...
397gpt-3.5-turbo21md-1195389,Blue Dream Roll Your Own Sugar Shak...Pacific Stone,Flower,Bud,nan,14chatcmpl-7VrjVafi1eGBXYfgmGBN0H3b0FzYOchat.completion1.687829e+09gpt-3.5-turbo-0301[{'index': 0, 'message': {'role': 'assistant',...{'prompt_tokens': 119, 'completion_tokens': 20...Pacific Stone,Sugar Shake,Blue Dream,Roll Your...
398gpt-3.5-turbo21co-847ZXF37,The Grower Circle - Zoo Dawg x Cos...The Growers Circle,Preroll,Joint,Zoo Dawg x Co...chatcmpl-7VrjWQpcRxJTdr3f4BUd7totDZpdFchat.completion1.687829e+09gpt-3.5-turbo-0301[{'index': 0, 'message': {'role': 'assistant',...{'prompt_tokens': 133, 'completion_tokens': 32...Multi Joint,Zoo Dawg x Cosa Nostra,The Grower ...
399gpt-3.5-turbo21co-8EMW15ZM,Flight Bites - S'mores - Gummy - 1...Flight Bites,Edible,Gummies,nan,nanchatcmpl-7VrjXiUHiyUyH7udPXIjANVmAUrrachat.completion1.687829e+09gpt-3.5-turbo-0301[{'index': 0, 'message': {'role': 'assistant',...{'prompt_tokens': 129, 'completion_tokens': 21...Flight Bites,Gummy,S'mores,10 count,100mg CO₂ ...
\n", "

400 rows × 12 columns

\n", "
" ], "text/plain": [ " model system_message n_shot \\\n", "0 gpt-4 1 0 \n", "1 gpt-4 1 0 \n", "2 gpt-4 1 0 \n", "3 gpt-4 1 0 \n", "4 gpt-4 1 0 \n", ".. ... ... ... \n", "395 gpt-3.5-turbo 2 1 \n", "396 gpt-3.5-turbo 2 1 \n", "397 gpt-3.5-turbo 2 1 \n", "398 gpt-3.5-turbo 2 1 \n", "399 gpt-3.5-turbo 2 1 \n", "\n", " prompt \\\n", "0 co-2MFE5QVF,Chill Medicated - Watermelon - Syr... \n", "1 bl-111630024545,Feelz - Space Cowboy 3.5g,nan,... \n", "2 fl-8voAjt83sD,Champelli | Xclusivo 3.5g | Eigh... \n", "3 bl-073133213364,CAM - Mellowz #7 7g,nan,FLOWER... \n", "4 fl-fwJQL2AWnS,Backpack Boyz | Bubblegum Gelato... \n", ".. ... \n", "395 co-76GP441T,Minntz - Emerald Cut - Indoor - Jo... \n", "396 co-5RAWYHYQ,The Growers Circle - Double Down -... \n", "397 md-1195389,Blue Dream Roll Your Own Sugar Shak... \n", "398 co-847ZXF37,The Grower Circle - Zoo Dawg x Cos... \n", "399 co-8EMW15ZM,Flight Bites - S'mores - Gummy - 1... \n", "\n", " completion \\\n", "0 Chill Medicated,Edible,Beverage,nan,nan \n", "1 Feelz,Flower,Bud,Space Cowboy,3.5 \n", "2 Champelli,Flower,Bud,Xclusivo,3.5 \n", "3 CAM,Flower,Bud,Mellowz #7,7 \n", "4 Backpack Boyz,Edible,CBD Tincture/Caps/etc,nan... \n", ".. ... \n", "395 Minntz,Preroll,Joint,Emerald Cut,1 \n", "396 The Growers Circle,Flower,Bud,Double Down,3.5 \n", "397 Pacific Stone,Flower,Bud,nan,14 \n", "398 The Growers Circle,Preroll,Joint,Zoo Dawg x Co... \n", "399 Flight Bites,Edible,Gummies,nan,nan \n", "\n", " id object created \\\n", "0 chatcmpl-7VlTkjAqXNRWfltMPpr5v37uBJIsg chat.completion 1.687805e+09 \n", "1 chatcmpl-7VlTtGF3RGsngfKB1BXufxoTixX2v chat.completion 1.687805e+09 \n", "2 chatcmpl-7VlU80b0m00VaiGymtj9dbqOggTgR chat.completion 1.687805e+09 \n", "3 chatcmpl-7VlUHqbsG2kpFHDxAWfsryh6pHmC9 chat.completion 1.687805e+09 \n", "4 chatcmpl-7VlUYvcad2wahIMHavhDEkYrgvjpw chat.completion 1.687805e+09 \n", ".. ... ... ... \n", "395 chatcmpl-7VrjRMvs2l8EJd4PVecpSRPCvV9Hk chat.completion 1.687829e+09 \n", "396 chatcmpl-7VrjT3wfVoLtq3G6xksfVtLz4FloJ chat.completion 1.687829e+09 \n", "397 chatcmpl-7VrjVafi1eGBXYfgmGBN0H3b0FzYO chat.completion 1.687829e+09 \n", "398 chatcmpl-7VrjWQpcRxJTdr3f4BUd7totDZpdF chat.completion 1.687829e+09 \n", "399 chatcmpl-7VrjXiUHiyUyH7udPXIjANVmAUrra chat.completion 1.687829e+09 \n", "\n", " openai_model choices \\\n", "0 gpt-4-0314 [ JSON: {\\n \"... \n", "1 gpt-4-0314 [ JSON: {\\n \"... \n", "2 gpt-4-0314 [ JSON: {\\n \"... \n", "3 gpt-4-0314 [ JSON: {\\n \"... \n", "4 gpt-4-0314 [ JSON: {\\n \"... \n", ".. ... ... \n", "395 gpt-3.5-turbo-0301 [{'index': 0, 'message': {'role': 'assistant',... \n", "396 gpt-3.5-turbo-0301 [{'index': 0, 'message': {'role': 'assistant',... \n", "397 gpt-3.5-turbo-0301 [{'index': 0, 'message': {'role': 'assistant',... \n", "398 gpt-3.5-turbo-0301 [{'index': 0, 'message': {'role': 'assistant',... \n", "399 gpt-3.5-turbo-0301 [{'index': 0, 'message': {'role': 'assistant',... \n", "\n", " usage \\\n", "0 {\\n \"prompt_tokens\": 54,\\n \"completion_token... \n", "1 {\\n \"prompt_tokens\": 51,\\n \"completion_token... \n", "2 {\\n \"prompt_tokens\": 71,\\n \"completion_token... \n", "3 {\\n \"prompt_tokens\": 49,\\n \"completion_token... \n", "4 {\\n \"prompt_tokens\": 59,\\n \"completion_token... \n", ".. ... \n", "395 {'prompt_tokens': 125, 'completion_tokens': 23... \n", "396 {'prompt_tokens': 123, 'completion_tokens': 22... \n", "397 {'prompt_tokens': 119, 'completion_tokens': 20... \n", "398 {'prompt_tokens': 133, 'completion_tokens': 32... \n", "399 {'prompt_tokens': 129, 'completion_tokens': 21... 
\n", "\n", " prediction \n", "0 Hello! It looks like you mentioned a product: ... \n", "1 Hello! It seems like you are referring to a pr... \n", "2 Hello! It seems like you're interested in the ... \n", "3 It seems like you are looking for information ... \n", "4 Hello! It seems like you are looking for infor... \n", ".. ... \n", "395 Minntz,Joint,Indoor,Emerald Cut,1g,co-76GP441T. \n", "396 The Growers Circle,Double Down,Indoor,3.5g,5RA... \n", "397 Pacific Stone,Sugar Shake,Blue Dream,Roll Your... \n", "398 Multi Joint,Zoo Dawg x Cosa Nostra,The Grower ... \n", "399 Flight Bites,Gummy,S'mores,10 count,100mg CO₂ ... \n", "\n", "[400 rows x 12 columns]" ] }, "execution_count": 181, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predictions_df" ] }, { "cell_type": "code", "execution_count": 182, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "669" ] }, "execution_count": 182, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from util import compare_completion_and_prediction\n", "\n", "# Function that uses compare_completion_and_prediction to return num_correct and return zero if there is an error\n", "def get_num_correct(completion, prediction):\n", " try:\n", " return compare_completion_and_prediction(completion, prediction)['num_correct']\n", " except:\n", " return 0 # this will be the case when format is incorrect\n", " \n", "# Apply get_num_correct function to predictions_df dataframe\n", "predictions_df['num_correct'] = predictions_df.apply(lambda row: get_num_correct(row['completion'], row['prediction']), axis=1)\n", "predictions_df['num_correct'].sum() # out of 1000 possible correct predictions (20 samples * 5 cols per sample) * (2 system messages * 2 models * 5 n_shot values)" ] }, { "cell_type": "code", "execution_count": 187, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "model system_message n_shot\n", "gpt-3.5-turbo 1 0 0.00\n", " 1 0.00\n", " 2 0 0.00\n", "gpt-4 1 0 0.00\n", " 1 0.00\n", " 2 0 0.00\n", "gpt-3.5-turbo 1 2 0.24\n", " 2 1 0.24\n", " 2 0.27\n", " 3 0.36\n", " 1 3 0.40\n", " 5 0.44\n", "gpt-4 2 2 0.45\n", " 1 2 0.45\n", " 2 1 0.47\n", "gpt-3.5-turbo 2 5 0.56\n", "gpt-4 1 3 0.62\n", " 2 3 0.67\n", " 5 0.73\n", " 1 5 0.79\n", "Name: num_correct, dtype: float64" ] }, "execution_count": 187, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predictions_df.groupby(['model', 'system_message', 'n_shot'])['num_correct'].sum().sort_values() / 100 # out of 100 possible correct predictions (20 samples * 5 cols per sample)" ] }, { "cell_type": "code", "execution_count": 184, "metadata": {}, "outputs": [], "source": [ "new_predictions_df.to_csv('../data/cookies_llm_eval_proc_preds.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [ { "data": { "text/html": [ "Waiting for W&B process to finish... (success)." ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "

Run history: usage/completion_tokens, usage/elapsed_time, usage/prompt_tokens, usage/total_tokens (per-call sparklines). Run summary: usage/completion_tokens 62, usage/elapsed_time 2.40086, usage/prompt_tokens 54, usage/total_tokens 116.
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ " View run rose-puddle-7 at: https://wandb.ai/kaleidoscope-data/cookies_llm_experimental_eval/runs/rbtf91s6
Synced 6 W&B file(s), 422 media file(s), 422 artifact file(s) and 0 other file(s)" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Find logs at: ./wandb/run-20230626_114056-rbtf91s6/logs" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "autolog.disable()" ] } ], "metadata": { "kernelspec": { "display_name": "kd-llm-dc", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.11" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }