{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "This Notebook is to test the various OpenAI models, prompts, and number of few-shot examples to see how they perform on the same task." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "!pip install wandb --upgrade openai datasets -qU" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import os\n", "from dotenv import load_dotenv\n", "load_dotenv()\n", "\n", "import openai\n", "\n", "# set OPENAI_API_KEY environment variable from .env file\n", "openai.api_key = os.getenv(\"OPENAI_API_KEY\")\n", "\n", "# import OpenAIChatCompletions class from openai_chat_completion.py file and compare_completion_and_prediction function from util.py file\n", "from openai_chat_completion import OpenAIChatCompletions\n", "from util import compare_completion_and_prediction" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Models:\n", "- gpt-3.5-turbo\n", "- gpt-4\n", "\n", "Prompts:\n", "- gpt4-system-message.txt\n", "\n", "Few-shot examples:\n", "> 0 ... 10" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "wandb setup:\n", "- entity: kaleidoscope-data\n", "- project: cookies_llm_experimental_eval\n", "- tags: gpt-3.5-turbo, gpt-4, gpt4-system-message, few-shot" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)\n", "\u001b[34m\u001b[1mwandb\u001b[0m: You can find your API key in your browser here: https://wandb.ai/authorize\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[32m\u001b[41mERROR\u001b[0m API key must be 40 characters long, yours was 48\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)\n", "\u001b[34m\u001b[1mwandb\u001b[0m: You can find your API key in your browser here: https://wandb.ai/authorize\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:\u001b[34m\u001b[1mwandb\u001b[0m: Appending key for api.wandb.ai to your netrc file: /home/cmagganas/.netrc\n" ] }, { "data": { "text/html": [ "Tracking run with wandb version 0.15.4" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Run data is saved locally in /home/cmagganas/kaleidoscope/llm_data_cleaner/app/wandb/run-20230626_114056-rbtf91s6" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Syncing run rose-puddle-7 to Weights & Biases (docs)
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ " View project at https://wandb.ai/kaleidoscope-data/cookies_llm_experimental_eval" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ " View run at https://wandb.ai/kaleidoscope-data/cookies_llm_experimental_eval/runs/rbtf91s6" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from wandb.integration.openai import autolog\n", "\n", "autolog({\"project\":\"cookies_llm_experimental_eval\",\n", " \"entity\": \"kaleidoscope-data\",\n", " \"group\": \"cookies\",\n", " \"job_type\": \"eval\"})" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# create an empty dataframe to store predictions\n", "import pandas as pd\n", "predictions_df = pd.DataFrame(columns=['model', 'system_message', 'n_shot', 'prompt', 'completion', 'prediction'])\n", "\n", "models_to_test = [\"gpt-4\", \"gpt-3.5-turbo\"]\n", "sys_mes_to_test = [\"../prompts/gpt4-system-message.txt\", \"../prompts/gpt4-system-message2.txt\"] # names are arbitrary, same prompts but with \"####\" in system message 2\n", "n_shots_to_test = [None, 1, 2, 3, 5]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# if rerunning the below cell is required, set the following to True\n", "rerun = False\n", "if rerun:\n", " predictions_df = pd.read_csv('../data/cookies_llm_eval_predictions.csv')" ] }, { "cell_type": "code", "execution_count": 178, "metadata": {}, "outputs": [], "source": [ "# get predictions for all combinations of models, prompts, and n_shot values\n", "# save predictions to dataframe and then to csv in data folder after each iteration\n", "\n", "# loop through models_to_test\n", "for model in models_to_test:\n", " # loop through prompts_to_test\n", " for system_message in sys_mes_to_test:\n", " # instantiate OpenAIChatCompletions class\n", " chat = OpenAIChatCompletions(model=model, system_message=system_message)\n", " # loop through n_shots_to_test\n", " for n_shot in n_shots_to_test:\n", " sys_mes_var = 1 if system_message == \"../prompts/gpt4-system-message.txt\" else 2\n", " n_shot_var = 0 if n_shot == None else n_shot\n", " # check if predictions for this model, system_message, and n_shot value have already been made\n", " if predictions_df[(predictions_df['model'] == model) & (predictions_df['system_message'] == sys_mes_var) & (predictions_df['n_shot'] == n_shot_var)].shape[0] == 0:\n", " prompts, completions, predictions = chat.predict_jsonl(n_shot=n_shot)\n", " else:\n", " # skip if predictions for this model, system_message, and n_shot value have already been made\n", " continue\n", " # save predictions to dataframe\n", " df_to_append = pd.DataFrame({'model': model, 'system_message': sys_mes_var, 'n_shot': n_shot_var, 'prompt': prompts, 'completion': completions, 'prediction': predictions})\n", " df_right = df_to_append['prediction'].apply(pd.Series)\n", " df_right['prediction'] = df_right['choices'].apply(lambda x: x[0]['message']['content']).drop(columns=['choices'])\n", " df_to_append = pd.concat([df_to_append[['model', 'system_message', 'n_shot', 'prompt', 'completion']], df_right], axis=1)\n", " df_to_append.columns = ['model', 'system_message', 'n_shot', 'prompt', 'completion', 'id', 'object', 'created', 'openai_model', 'choices', 'usage', 'prediction']\n", " # save predictions to dataframe\n", " predictions_df = pd.concat([predictions_df, 
df_to_append], ignore_index=True)\n", " # delete duplicates from dataframe\n", " predictions_df = predictions_df[~predictions_df.duplicated(subset=['model', 'system_message', 'n_shot', 'prompt'])]\n", " predictions_df.to_csv('../data/cookies_llm_eval_predictions.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 179, "metadata": {}, "outputs": [], "source": [ "predictions_df = predictions_df[~predictions_df.duplicated(subset=['model', 'system_message', 'n_shot', 'prompt'])]" ] }, { "cell_type": "code", "execution_count": 180, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(400, 12)" ] }, "execution_count": 180, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predictions_df.shape" ] }, { "cell_type": "code", "execution_count": 143, "metadata": {}, "outputs": [], "source": [ "# import numpy as np\n", "\n", "# ids = predictions_df['id'].isna()\n", "# # apply pd.Series to predictions column for rows where id is not null and change system_message {0,1} to {1,2}\n", "# new_df_right = predictions_df.loc[ids, 'prediction'].apply(pd.Series)\n", "# new_df_right['prediction'] = new_df_right['choices'].apply(lambda x: x[0]['message']['content']).drop(columns=['choices'])\n", "# new_df_left = predictions_df.loc[ids, ['model', 'system_message', 'n_shot', 'prompt', 'completion']].replace({0:1, 1:2})\n", "# new_df = pd.concat([new_df_left, new_df_right], axis=1)\n", "\n", "# predictions_df.columns = ['model', 'system_message', 'n_shot', 'prompt', 'completion', 'id', 'object', 'created', 'openai_model', 'choices', 'usage', 'prediction']\n", "# new_df.columns = ['model', 'system_message', 'n_shot', 'prompt', 'completion', 'id', 'object', 'created', 'openai_model', 'choices', 'usage', 'prediction']\n", "# predictions_df.loc[ids] = new_df" ] }, { "cell_type": "code", "execution_count": 155, "metadata": {}, "outputs": [], "source": [ "# for col in ['model','system_message','n_shot']:\n", "# print(predictions_df[col].value_counts())" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [], "source": [ "# import numpy as np\n", "\n", "# # create a copy of predictions_df to manipulate\n", "# new_predictions_df = predictions_df\n", "\n", "# # replace names with 1 or 2\n", "# def replace_sys_mes_name(x):\n", "# if x == \"../prompts/gpt4-system-message.txt\":\n", "# return \"1\"\n", "# elif x == \"../prompts/gpt4-system-message2.txt\":\n", "# return \"2\"\n", "# else:\n", "# return x\n", "# new_predictions_df['system_message'] = new_predictions_df['system_message'].apply(lambda x: replace_sys_mes_name(x))\n", "# # replace None with 0\n", "# new_predictions_df['n_shot'] = new_predictions_df['n_shot'].apply(lambda x: 0 if x == None or np.nan else x)\n", "\n", "# # break up prediction column into sub columns by each of json keys\n", "# new_predictions_df = pd.concat([new_predictions_df, new_predictions_df['prediction'].apply(pd.Series)], axis=1)" ] }, { "cell_type": "code", "execution_count": 168, "metadata": {}, "outputs": [], "source": [ "# predictions_df.drop(columns=['num_correct'], inplace=True)" ] }, { "cell_type": "code", "execution_count": 181, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
modelsystem_messagen_shotpromptcompletionidobjectcreatedopenai_modelchoicesusageprediction
0gpt-410co-2MFE5QVF,Chill Medicated - Watermelon - Syr...Chill Medicated,Edible,Beverage,nan,nanchatcmpl-7VlTkjAqXNRWfltMPpr5v37uBJIsgchat.completion1.687805e+09gpt-4-0314[<OpenAIObject at 0x7fcf7fde94e0> JSON: {\\n \"...{\\n \"prompt_tokens\": 54,\\n \"completion_token...Hello! It looks like you mentioned a product: ...
1gpt-410bl-111630024545,Feelz - Space Cowboy 3.5g,nan,...Feelz,Flower,Bud,Space Cowboy,3.5chatcmpl-7VlTtGF3RGsngfKB1BXufxoTixX2vchat.completion1.687805e+09gpt-4-0314[<OpenAIObject at 0x7fcf7f49d2b0> JSON: {\\n \"...{\\n \"prompt_tokens\": 51,\\n \"completion_token...Hello! It seems like you are referring to a pr...
2gpt-410fl-8voAjt83sD,Champelli | Xclusivo 3.5g | Eigh...Champelli,Flower,Bud,Xclusivo,3.5chatcmpl-7VlU80b0m00VaiGymtj9dbqOggTgRchat.completion1.687805e+09gpt-4-0314[<OpenAIObject at 0x7fcf7e306890> JSON: {\\n \"...{\\n \"prompt_tokens\": 71,\\n \"completion_token...Hello! It seems like you're interested in the ...
3gpt-410bl-073133213364,CAM - Mellowz #7 7g,nan,FLOWER...CAM,Flower,Bud,Mellowz #7,7chatcmpl-7VlUHqbsG2kpFHDxAWfsryh6pHmC9chat.completion1.687805e+09gpt-4-0314[<OpenAIObject at 0x7fcf7e33d940> JSON: {\\n \"...{\\n \"prompt_tokens\": 49,\\n \"completion_token...It seems like you are looking for information ...
4gpt-410fl-fwJQL2AWnS,Backpack Boyz | Bubblegum Gelato...Backpack Boyz,Edible,CBD Tincture/Caps/etc,nan...chatcmpl-7VlUYvcad2wahIMHavhDEkYrgvjpwchat.completion1.687805e+09gpt-4-0314[<OpenAIObject at 0x7fcf7e306980> JSON: {\\n \"...{\\n \"prompt_tokens\": 59,\\n \"completion_token...Hello! It seems like you are looking for infor...
.......................................
395gpt-3.5-turbo21co-76GP441T,Minntz - Emerald Cut - Indoor - Jo...Minntz,Preroll,Joint,Emerald Cut,1chatcmpl-7VrjRMvs2l8EJd4PVecpSRPCvV9Hkchat.completion1.687829e+09gpt-3.5-turbo-0301[{'index': 0, 'message': {'role': 'assistant',...{'prompt_tokens': 125, 'completion_tokens': 23...Minntz,Joint,Indoor,Emerald Cut,1g,co-76GP441T.
396gpt-3.5-turbo21co-5RAWYHYQ,The Growers Circle - Double Down -...The Growers Circle,Flower,Bud,Double Down,3.5chatcmpl-7VrjT3wfVoLtq3G6xksfVtLz4FloJchat.completion1.687829e+09gpt-3.5-turbo-0301[{'index': 0, 'message': {'role': 'assistant',...{'prompt_tokens': 123, 'completion_tokens': 22...The Growers Circle,Double Down,Indoor,3.5g,5RA...
397gpt-3.5-turbo21md-1195389,Blue Dream Roll Your Own Sugar Shak...Pacific Stone,Flower,Bud,nan,14chatcmpl-7VrjVafi1eGBXYfgmGBN0H3b0FzYOchat.completion1.687829e+09gpt-3.5-turbo-0301[{'index': 0, 'message': {'role': 'assistant',...{'prompt_tokens': 119, 'completion_tokens': 20...Pacific Stone,Sugar Shake,Blue Dream,Roll Your...
398gpt-3.5-turbo21co-847ZXF37,The Grower Circle - Zoo Dawg x Cos...The Growers Circle,Preroll,Joint,Zoo Dawg x Co...chatcmpl-7VrjWQpcRxJTdr3f4BUd7totDZpdFchat.completion1.687829e+09gpt-3.5-turbo-0301[{'index': 0, 'message': {'role': 'assistant',...{'prompt_tokens': 133, 'completion_tokens': 32...Multi Joint,Zoo Dawg x Cosa Nostra,The Grower ...
399gpt-3.5-turbo21co-8EMW15ZM,Flight Bites - S'mores - Gummy - 1...Flight Bites,Edible,Gummies,nan,nanchatcmpl-7VrjXiUHiyUyH7udPXIjANVmAUrrachat.completion1.687829e+09gpt-3.5-turbo-0301[{'index': 0, 'message': {'role': 'assistant',...{'prompt_tokens': 129, 'completion_tokens': 21...Flight Bites,Gummy,S'mores,10 count,100mg CO₂ ...
\n", "

400 rows × 12 columns

\n", "
" ], "text/plain": [ " model system_message n_shot \\\n", "0 gpt-4 1 0 \n", "1 gpt-4 1 0 \n", "2 gpt-4 1 0 \n", "3 gpt-4 1 0 \n", "4 gpt-4 1 0 \n", ".. ... ... ... \n", "395 gpt-3.5-turbo 2 1 \n", "396 gpt-3.5-turbo 2 1 \n", "397 gpt-3.5-turbo 2 1 \n", "398 gpt-3.5-turbo 2 1 \n", "399 gpt-3.5-turbo 2 1 \n", "\n", " prompt \\\n", "0 co-2MFE5QVF,Chill Medicated - Watermelon - Syr... \n", "1 bl-111630024545,Feelz - Space Cowboy 3.5g,nan,... \n", "2 fl-8voAjt83sD,Champelli | Xclusivo 3.5g | Eigh... \n", "3 bl-073133213364,CAM - Mellowz #7 7g,nan,FLOWER... \n", "4 fl-fwJQL2AWnS,Backpack Boyz | Bubblegum Gelato... \n", ".. ... \n", "395 co-76GP441T,Minntz - Emerald Cut - Indoor - Jo... \n", "396 co-5RAWYHYQ,The Growers Circle - Double Down -... \n", "397 md-1195389,Blue Dream Roll Your Own Sugar Shak... \n", "398 co-847ZXF37,The Grower Circle - Zoo Dawg x Cos... \n", "399 co-8EMW15ZM,Flight Bites - S'mores - Gummy - 1... \n", "\n", " completion \\\n", "0 Chill Medicated,Edible,Beverage,nan,nan \n", "1 Feelz,Flower,Bud,Space Cowboy,3.5 \n", "2 Champelli,Flower,Bud,Xclusivo,3.5 \n", "3 CAM,Flower,Bud,Mellowz #7,7 \n", "4 Backpack Boyz,Edible,CBD Tincture/Caps/etc,nan... \n", ".. ... \n", "395 Minntz,Preroll,Joint,Emerald Cut,1 \n", "396 The Growers Circle,Flower,Bud,Double Down,3.5 \n", "397 Pacific Stone,Flower,Bud,nan,14 \n", "398 The Growers Circle,Preroll,Joint,Zoo Dawg x Co... \n", "399 Flight Bites,Edible,Gummies,nan,nan \n", "\n", " id object created \\\n", "0 chatcmpl-7VlTkjAqXNRWfltMPpr5v37uBJIsg chat.completion 1.687805e+09 \n", "1 chatcmpl-7VlTtGF3RGsngfKB1BXufxoTixX2v chat.completion 1.687805e+09 \n", "2 chatcmpl-7VlU80b0m00VaiGymtj9dbqOggTgR chat.completion 1.687805e+09 \n", "3 chatcmpl-7VlUHqbsG2kpFHDxAWfsryh6pHmC9 chat.completion 1.687805e+09 \n", "4 chatcmpl-7VlUYvcad2wahIMHavhDEkYrgvjpw chat.completion 1.687805e+09 \n", ".. ... ... ... \n", "395 chatcmpl-7VrjRMvs2l8EJd4PVecpSRPCvV9Hk chat.completion 1.687829e+09 \n", "396 chatcmpl-7VrjT3wfVoLtq3G6xksfVtLz4FloJ chat.completion 1.687829e+09 \n", "397 chatcmpl-7VrjVafi1eGBXYfgmGBN0H3b0FzYO chat.completion 1.687829e+09 \n", "398 chatcmpl-7VrjWQpcRxJTdr3f4BUd7totDZpdF chat.completion 1.687829e+09 \n", "399 chatcmpl-7VrjXiUHiyUyH7udPXIjANVmAUrra chat.completion 1.687829e+09 \n", "\n", " openai_model choices \\\n", "0 gpt-4-0314 [ JSON: {\\n \"... \n", "1 gpt-4-0314 [ JSON: {\\n \"... \n", "2 gpt-4-0314 [ JSON: {\\n \"... \n", "3 gpt-4-0314 [ JSON: {\\n \"... \n", "4 gpt-4-0314 [ JSON: {\\n \"... \n", ".. ... ... \n", "395 gpt-3.5-turbo-0301 [{'index': 0, 'message': {'role': 'assistant',... \n", "396 gpt-3.5-turbo-0301 [{'index': 0, 'message': {'role': 'assistant',... \n", "397 gpt-3.5-turbo-0301 [{'index': 0, 'message': {'role': 'assistant',... \n", "398 gpt-3.5-turbo-0301 [{'index': 0, 'message': {'role': 'assistant',... \n", "399 gpt-3.5-turbo-0301 [{'index': 0, 'message': {'role': 'assistant',... \n", "\n", " usage \\\n", "0 {\\n \"prompt_tokens\": 54,\\n \"completion_token... \n", "1 {\\n \"prompt_tokens\": 51,\\n \"completion_token... \n", "2 {\\n \"prompt_tokens\": 71,\\n \"completion_token... \n", "3 {\\n \"prompt_tokens\": 49,\\n \"completion_token... \n", "4 {\\n \"prompt_tokens\": 59,\\n \"completion_token... \n", ".. ... \n", "395 {'prompt_tokens': 125, 'completion_tokens': 23... \n", "396 {'prompt_tokens': 123, 'completion_tokens': 22... \n", "397 {'prompt_tokens': 119, 'completion_tokens': 20... \n", "398 {'prompt_tokens': 133, 'completion_tokens': 32... \n", "399 {'prompt_tokens': 129, 'completion_tokens': 21... 
\n", "\n", " prediction \n", "0 Hello! It looks like you mentioned a product: ... \n", "1 Hello! It seems like you are referring to a pr... \n", "2 Hello! It seems like you're interested in the ... \n", "3 It seems like you are looking for information ... \n", "4 Hello! It seems like you are looking for infor... \n", ".. ... \n", "395 Minntz,Joint,Indoor,Emerald Cut,1g,co-76GP441T. \n", "396 The Growers Circle,Double Down,Indoor,3.5g,5RA... \n", "397 Pacific Stone,Sugar Shake,Blue Dream,Roll Your... \n", "398 Multi Joint,Zoo Dawg x Cosa Nostra,The Grower ... \n", "399 Flight Bites,Gummy,S'mores,10 count,100mg CO₂ ... \n", "\n", "[400 rows x 12 columns]" ] }, "execution_count": 181, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predictions_df" ] }, { "cell_type": "code", "execution_count": 182, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "669" ] }, "execution_count": 182, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from util import compare_completion_and_prediction\n", "\n", "# Function that uses compare_completion_and_prediction to return num_correct and return zero if there is an error\n", "def get_num_correct(completion, prediction):\n", " try:\n", " return compare_completion_and_prediction(completion, prediction)['num_correct']\n", " except:\n", " return 0 # this will be the case when format is incorrect\n", " \n", "# Apply get_num_correct function to predictions_df dataframe\n", "predictions_df['num_correct'] = predictions_df.apply(lambda row: get_num_correct(row['completion'], row['prediction']), axis=1)\n", "predictions_df['num_correct'].sum() # out of 1000 possible correct predictions (20 samples * 5 cols per sample) * (2 system messages * 2 models * 5 n_shot values)" ] }, { "cell_type": "code", "execution_count": 187, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "model system_message n_shot\n", "gpt-3.5-turbo 1 0 0.00\n", " 1 0.00\n", " 2 0 0.00\n", "gpt-4 1 0 0.00\n", " 1 0.00\n", " 2 0 0.00\n", "gpt-3.5-turbo 1 2 0.24\n", " 2 1 0.24\n", " 2 0.27\n", " 3 0.36\n", " 1 3 0.40\n", " 5 0.44\n", "gpt-4 2 2 0.45\n", " 1 2 0.45\n", " 2 1 0.47\n", "gpt-3.5-turbo 2 5 0.56\n", "gpt-4 1 3 0.62\n", " 2 3 0.67\n", " 5 0.73\n", " 1 5 0.79\n", "Name: num_correct, dtype: float64" ] }, "execution_count": 187, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predictions_df.groupby(['model', 'system_message', 'n_shot'])['num_correct'].sum().sort_values() / 100 # out of 100 possible correct predictions (20 samples * 5 cols per sample)" ] }, { "cell_type": "code", "execution_count": 184, "metadata": {}, "outputs": [], "source": [ "new_predictions_df.to_csv('../data/cookies_llm_eval_proc_preds.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [ { "data": { "text/html": [ "Waiting for W&B process to finish... (success)." ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "

Run history: usage/completion_tokens, usage/elapsed_time, usage/prompt_tokens, usage/total_tokens (per-call sparklines). Run summary: usage/completion_tokens 62, usage/elapsed_time 2.40086, usage/prompt_tokens 54, usage/total_tokens 116.
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ " View run rose-puddle-7 at: https://wandb.ai/kaleidoscope-data/cookies_llm_experimental_eval/runs/rbtf91s6
Synced 6 W&B file(s), 422 media file(s), 422 artifact file(s) and 0 other file(s)" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Find logs at: ./wandb/run-20230626_114056-rbtf91s6/logs" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "autolog.disable()" ] } ], "metadata": { "kernelspec": { "display_name": "kd-llm-dc", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.11" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }