## This notebook shows how to load CSV data and convert it into JSONL format for the LLM data cleaner.

First, we load the data.

In [None]:
import csv
import io
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

# Read the comma-separated gold-standard file into a pandas dataframe
csv_path = '../data/Cookies-AI-Gold-Standard - Cookies-AI-Gold-Standard.csv'
cookies = pd.read_csv(csv_path, sep=',')

# Preview the first rows
cookies.head()

Unnamed: 0,sku,product_name (pos),brand (pos),product_category (pos),strain_name (pos),product_weight_grams (pos),brand (manual review),product_category (manual review),sub_product_category (manual review),strain_name (manual review),product_weight_grams (manual review)
0,bl-842922110296,STIIIZY - Birthday Cake Pod 1g,,VAPE PENS 1G,,1.0,STIIIZY,Vape,Vape,Birthday Cake,1.0
1,co-6ARLLX12,SMASH Hits - Hippie Slayer - Indoor - 1g,SMASH Hits,,Hippie Slayer,,SMASH Hits,Preroll,Joint,Hippie Slayer,1.0
2,bl-090035986141,Eighth Brothers - Black Jack 1g Preroll,,PREROLLS,,,Eighth Brothers,Preroll,Joint,Black Jack,1.0
3,bl-850002822274,GRIZZLY PEAK - Indica Bone 0.5g 7PK Prerolls,,PREROLL PACKS,,,GRIZZLY PEAK,Preroll,Joint,,3.5
4,co-76GP441T,Minntz - Emerald Cut - Indoor - Joint - 1g,Minntz,,Emerald Cut,,Minntz,Preroll,Joint,Emerald Cut,1.0


### Data Preparation
We transform the dataset into a pandas dataframe with two columns: one for the prompt and one for the completion.

The prompt contains the "dirty" columns, and completion contains the "cleaned" columns.

In [None]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the remaining 80% is the training set.
# A fixed random_state keeps the split reproducible.
cookies_train, cookies_test = train_test_split(
    cookies,
    test_size=0.20,
    random_state=42,
)

# "(pos)" columns plus the sku: serialized into the prompt
input_columns = [
    'sku',
    'product_name (pos)',
    'brand (pos)',
    'product_category (pos)',
    'strain_name (pos)',
    'product_weight_grams (pos)',
]

# "(manual review)" columns: serialized into the completion
output_columns = [
    'brand (manual review)',
    'product_category (manual review)',
    'sub_product_category (manual review)',
    'strain_name (manual review)',
    'product_weight_grams (manual review)',
]

# function to convert a pandas dataframe row to a csv string
def row_to_csv(row):
    """Serialize one dataframe row as a single CSV-formatted line.

    Every value is converted with ``str`` (so missing values appear as
    ``nan``) and the fields are written in order, separated by commas.
    Fields that contain a comma or a double quote are wrapped in double
    quotes by the ``csv`` module, with embedded quotes doubled, so the
    number of columns in the line stays unambiguous. For fields without
    such characters the result matches a plain ``','.join``.

    Parameters
    ----------
    row : pandas.Series
        Row whose ``.values`` are serialized.

    Returns
    -------
    str
        The CSV line without a trailing record terminator.
    """
    buffer = io.StringIO()
    # Default dialect: comma delimiter, minimal quoting, '\r\n' terminator.
    csv.writer(buffer).writerow([str(value) for value in row.values])
    # Drop the two-character '\r\n' terminator the writer appends.
    return buffer.getvalue()[:-2]

# Build prompt/completion dataframes for the train and test splits and save
# each one as a JSON Lines file (one {"prompt": ..., "completion": ...} per line).

# apply row_to_csv function to each row of the training dataframe
input_rows  = cookies_train[input_columns ].apply(row_to_csv, axis=1)
output_rows = cookies_train[output_columns].apply(row_to_csv, axis=1)

# create dataframe with prompt and completion columns for training dataset
prompt_df = pd.DataFrame(
    zip(input_rows,
        output_rows)
    , columns = ['prompt','completion'])

# save dataframe to jsonl file for training
prompt_df.to_json("../data/cookies_train.jsonl", orient='records', lines=True)

# apply row_to_csv function to each row of the test dataframe
input_test_rows  = cookies_test[input_columns ].apply(row_to_csv, axis=1)
output_test_rows = cookies_test[output_columns].apply(row_to_csv, axis=1)

# create dataframe with prompt and completion columns for test dataset
test_df = pd.DataFrame(
    zip(input_test_rows,
        output_test_rows)
    , columns = ['prompt','completion'])

# save dataframe to jsonl file for test
test_df.to_json("../data/cookies_test.jsonl", orient='records', lines=True)

# sanity check: every source row ended up in exactly one of the two splits
assert len(prompt_df) + len(test_df) == len(cookies)

# Preview the test prompts/completions. This has to be the last expression
# of the cell, otherwise the notebook does not display it.
test_df.head()

In [None]:
import pandas as pd

# sample n rows from a jsonl file
def sample_jsonl(path_or_buf='../data/cookies_train.jsonl', n_samples=5, random_state=42):
    """Return ``n_samples`` randomly selected records from a JSON Lines file.

    Parameters
    ----------
    path_or_buf : str or file-like
        Source passed to ``pandas.read_json`` with ``lines=True``.
    n_samples : int
        Number of rows to draw (without replacement).
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample``. The default of 42 keeps the
        selection reproducible between calls.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, keeping their original index labels.
    """
    records = pd.read_json(path_or_buf=path_or_buf, lines=True)
    return records.sample(n_samples, random_state=random_state)

In [None]:
# extend a chat message list with sampled prompt/completion examples
def add_samples(messages, n_samples=None):
    """Append sampled prompt/completion pairs to ``messages`` as chat turns.

    For every sampled record a ``user`` message holding the prompt is
    appended, followed by an ``assistant`` message holding the completion.
    The list is modified in place and also returned. When ``n_samples`` is
    ``None`` the list is returned untouched.
    """
    if n_samples is None:
        return messages

    turn_layout = (("user", "prompt"), ("assistant", "completion"))
    examples = sample_jsonl(n_samples=n_samples)
    for _, example in examples.iterrows():
        for role, field in turn_layout:
            messages.append({"role": role, "content": example[field]})
    return messages