cmagganas commited on
Commit
f41d972
1 Parent(s): d6674c7

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. .vscode/settings.json +7 -0
  2. app/app.py +35 -12
  3. app/dataclean_hf.py +151 -0
.vscode/settings.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cSpell.words": [
3
+ "openai",
4
+ "sqlalchemy",
5
+ "streamlit"
6
+ ]
7
+ }
app/app.py CHANGED
@@ -2,6 +2,7 @@
2
 
3
  import streamlit as st
4
  from openai_chat_completion import OpenAIChatCompletions
 
5
 
6
  st.title("Kaleidoscope Data - Data Cleaning LLM App")
7
 
@@ -9,22 +10,44 @@ st.write("This app is a demo of the LLM model for data cleaning. It is a work in
9
 
10
  # text box or csv upload
11
  text_input = st.text_input("Enter text", "")
12
- # csv_file = st.file_uploader("Upload CSV", type=['csv'])
13
 
14
  # button to run data cleaning API on text via c class in openai_chat_completion.py
15
  if st.button("Run Data Cleaning API"):
16
 
17
  # if text_input is not empty, run data cleaning API on text_input
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
- model = "gpt-4" # "gpt-3.5-turbo"
20
- sys_mes = "prompts/gpt4-system-message.txt"
21
-
22
- # instantiate OpenAIChatCompletions class
23
- # get response from openai_chat_completion method
24
- chat = OpenAIChatCompletions(model=model, system_message=sys_mes)
25
- response = chat.openai_chat_completion(text_input, n_shot=5)
26
-
27
-
28
- # display response
29
- st.write(response['choices'][0]['message']['content'])
 
 
 
 
 
 
 
 
 
30
 
 
2
 
3
import pandas as pd
import streamlit as st
from openai_chat_completion import OpenAIChatCompletions
from dataclean_hf import main

st.title("Kaleidoscope Data - Data Cleaning LLM App")

# Inputs: free text OR a CSV upload (mutually exclusive; text wins below).
text_input = st.text_input("Enter text", "")
csv_file = st.file_uploader("Upload CSV", type=['csv'])

# button to run data cleaning API on text via class in openai_chat_completion.py
if st.button("Run Data Cleaning API"):

    # if text_input is not empty, run the chat-completion cleaner on it
    if text_input:

        model = "gpt-4"  # "gpt-3.5-turbo"
        sys_mes = "prompts/gpt4-system-message.txt"

        # instantiate OpenAIChatCompletions and request a completion
        chat = OpenAIChatCompletions(model=model, system_message=sys_mes)
        response = chat.openai_chat_completion(text_input, n_shot=5)

        # display the model's reply
        st.write(response['choices'][0]['message']['content'])

    # otherwise, if a CSV was uploaded, run the batch cleaner on it
    elif csv_file:

        # BUG FIX: main() expects a DataFrame (it sets a 'key' column and an
        # index on it), not Streamlit's UploadedFile — parse the upload first.
        output_df = main(pd.read_csv(csv_file))

        # st.cache is deprecated; st.cache_data is the current API for
        # caching data conversions so reruns don't recompute the CSV bytes.
        @st.cache_data
        def convert_df(df):
            # IMPORTANT: cache the conversion to prevent computation on every rerun
            return df.to_csv().encode('utf-8')

        csv = convert_df(output_df)

        st.download_button(
            label="Download data as CSV",
            data=csv,
            file_name='cleaned_df.csv',
            mime='text/csv',
        )
app/dataclean_hf.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from io import StringIO
3
+ import csv
4
+ import pandas as pd
5
+ import numpy as np
6
+ import streamlit as st
7
+
8
+ from sqlalchemy import create_engine
9
+ from yallmf.utils import run_with_timeout
10
+ import openai
11
+
12
+ from tqdm import tqdm
13
+
14
+
15
+ openai.api_key = st.secrets["OPENAI_API_KEY"]
16
+
17
def clean_data(
    input_product_names: pd.Series,
    input_brands: pd.Series,
    input_product_categories: pd.Series,
    category_taxonomy: dict) -> pd.DataFrame:
    """Send one batch of product rows to GPT-4 and return the inputs joined
    with the model's cleaned columns (brand, product_category,
    sub_product_category, strain_name).

    The three input Series are combined positionally into one frame, so they
    are assumed to be equal length and index-aligned — TODO confirm callers
    guarantee this. Rows whose model output is malformed (wrong column count)
    or violates the category taxonomy are dropped, so the returned frame may
    be shorter than the input.

    Raises AssertionError if the surviving model rows don't line up 1:1 with
    the surviving input rows.
    """

    # Columns the model is instructed to emit, in order.
    output_cols = ['brand', 'product_category', 'sub_product_category', 'strain_name']
    ncols = len(output_cols)

    # System-style instructions; the taxonomy dict is embedded as JSON so the
    # model only picks category/subcategory pairs we can validate below.
    p1 = f'''
I am going to provide a data set of marijuana products and their metadata. Using the information I provide, I want you to provide me with the following information about the products.

- Brand (brand)
- product category (product_category)
- sub product category (sub_product_category)
- strain name (strain_name)

The following JSON shows all the acceptable Product Categories and their Sub Product Categories. Strictly adhere to the below mapping for valid product_category to sub_product_category relationships:

{json.dumps(category_taxonomy)}

Additional requirements:

- The input data set in CSV format, with commas as field delimiter and newline as row delimiter.
- Do not automatically assume that the information in the data set I provide is accurate.
- Leave the 'sub_product_category' field blank unless there's a clear and direct match with one of the categories provided in the list.If there is no explicit information to confidently assign a sub_product_category, default to leaving it blank.
- Strain names are only applicable for the following product categories: concentrate, preroll, vape, flower
- Look for clues in the product name to determine what brand/ product category/ sub product category/ and strain name the product should fall under. For Vape products, consider the words before 'Cartridge' or 'Cart' in the product name as potential strain names.
- Every row of the Output CSV must have EXACTLY {ncols} columns.
- When a field is left empty (e.g., 'sub_product_category' or 'strain_name'), simply leave it empty without placing an additional comma. Each row in the output CSV should always have only three commas separating the four fields regardless of whether some fields are empty. For instance, if 'sub_product_category' and 'strain_name' are empty, a row would look like this: "brand,product_category,,"
- DO NOT EXPLAIN YOURSELF, ONLY RETURN A CSV WITH THESE COLUMNS: {', '.join(output_cols)}

Input data set in CSV format:

'''
    df = pd.DataFrame({'input__product_name':input_product_names,
                       'input__brand':input_brands,
                       'input__product_category':input_product_categories}).reset_index(drop=True)
    # remove commas from all strings so the CSV sent to the model has exactly
    # one comma per delimiter (assumes all three columns are strings — a
    # non-string column would make .str.replace fail; TODO confirm)
    df2 = df.copy()
    for col in df2.columns:
        df2[col] = df2[col].str.replace(',', '')

    # send to LLM; run_with_timeout (yallmf) presumably aborts the call after
    # `timeout` seconds — verify its failure mode against that library
    p2 = df2.to_csv(index=False, quoting=csv.QUOTE_ALL)
    messages = [{'role':'system','content':'You are a helpful assistant. Return a properly-formatted CSV with the correct number of columns.'},
                {'role':'user', 'content':p1+p2+'\n\nOutput CSV with header row:\n\n'}
               ]
    comp = run_with_timeout(openai.ChatCompletion.create,
                            model='gpt-4',
                            messages=messages,
                            max_tokens=2000,
                            timeout=300,
                            temperature=0.2
                            )
    res = comp['choices'][0]['message']['content']

    # remove rows with wrong number of columns.
    # i==0 is the model's header row; data row i corresponds to input row i-1,
    # and df still has its pre-drop 0..n-1 labels so df.drop(i-1) removes the
    # matching input row when the model's row i is malformed.
    keeprows = []
    for i,s in enumerate(res.split('\n')):
        if i==0:
            keeprows.append(s)
            continue
        _ncols = len(s.split(','))
        if _ncols!=ncols:
            print(f'Got {_ncols} columns, skipping row {i-1} ({s})')
            df = df.drop(i-1)
        else:
            keeprows.append(s)
    df = df.reset_index(drop=True)

    # parse the surviving model rows back into a frame
    resdf = pd.read_csv(StringIO('\n'.join(keeprows)))

    # both sides dropped the same rows, so lengths must match before the
    # positional concat below — otherwise columns would be misaligned
    assert len(df)==len(resdf), 'Result CSV did not match input CSV in length'
    df = pd.concat([df.reset_index(drop=True),resdf.reset_index(drop=True)],axis=1)
    # check category/subcategory: drop rows where the model's category or
    # its category->subcategory pairing is outside the provided taxonomy
    dropidxs=[]
    for idx, row in df.iterrows():
        drop = False
        if pd.isna(row['product_category']) and not pd.isna(row['sub_product_category']):
            drop=True
            print('product_category is null while sub_product_category is not null, dropping')
        if not pd.isna(row['product_category']):
            if row['product_category'] not in category_taxonomy.keys():
                print(f'category "{row["product_category"]}" not in taxonomy, dropping row')
                drop =True
            elif not pd.isna(row['sub_product_category']):
                if row['sub_product_category'] not in category_taxonomy[row['product_category']]:
                    print(f'subcategory "{row["sub_product_category"]}" not valid for category {row["product_category"]}, dropping row')
                    drop =True
        if drop:
            dropidxs.append(idx)
    df = df.drop(dropidxs)

    return df
112
+
113
def get_key(df):
    """Build a composite row key by concatenating the three raw input columns.

    Plain Series `+` concatenation is used, so a NaN in any column yields a
    NaN key for that row, exactly as before.
    """
    key_columns = ('input__product_name', 'input__brand', 'input__product_category')
    key = df[key_columns[0]]
    for column in key_columns[1:]:
        key = key + df[column]
    return key
115
+
116
def main(upload_df: pd.DataFrame,
         chunksize: int = 30,
         output_df: pd.DataFrame = None,
         ):
    """Clean an uploaded product DataFrame in chunks via the LLM.

    Args:
        upload_df: raw products with columns input__product_name,
            input__brand, input__product_category.
        chunksize: number of rows sent to the model per request.
        output_df: previously cleaned results, if any. Rows of upload_df
            whose composite key already appears in output_df are skipped,
            so a partially completed run can be resumed.

    Returns:
        A DataFrame of cleaned results indexed by the composite key
        (prior results concatenated with newly cleaned chunks), or None
        if upload_df is empty and no prior output was supplied.

    BUG FIX: `output_df` was read below but never defined anywhere in the
    module, so every call raised NameError. It is now a keyword parameter
    defaulting to None (backward compatible with existing callers).
    """
    # Closed set of valid category -> sub-category pairings handed to the
    # model and used by clean_data() to validate its output.
    category_taxonomy = {
        "Wellness": ["Mushroom Caps", "CBD Tincture/Caps/etc", "Promo/ Sample", "Capsule", "Liquid Flower", ""],
        "Concentrate": ["Diamonds", "Shatter", "Sugar", "Promo/ Sample", "Badder", "Diamonds and Sauce", "Rosin", "Cookies Dough", "Flan", "Cookie Dough", ""],
        "Preroll": ["Cubano", "Joint", "Promo/ Sample", "Blunt", "Infused Joint", "Packwoods Blunt", "Infused Blunt", "Napalm", ""],
        "Vape": ["Terp Sauce", "Gpen 0.5", "Cured Resin", "Solventless Rosin", "510", "Dry Flower Series", "Natural Terp Series", "Promo/ Sample", "Dart Pod 0.5", "Raw Garden", "Live Flower Series", "Rosin", "Disposable", ""],
        "Edible": ["Cookies", "Gummies", "Mint", "Promo/ Sample", "Beverage", "Chocolate", ""],
        "Grow Products": ["Promo/ Sample", ""],
        "Flower": ["Promo/ Sample", "Bud", ""],
        "Accessory": ["Promo/ Sample", ""]
    }

    # Work on a copy so the caller's DataFrame is not mutated by the key
    # column / index manipulation below.
    upload_df = upload_df.copy()

    # Key both frames and diff: only rows not already in output_df are run.
    upload_df['key'] = get_key(upload_df)
    upload_df = upload_df.set_index('key')
    if output_df is None:
        rundf = upload_df
        outlen = 0
    else:
        output_df = output_df.copy()
        output_df['key'] = get_key(output_df)
        output_df = output_df.set_index('key')
        rundf = upload_df.loc[~upload_df.index.isin(output_df.index)]
        outlen = len(output_df)

    # outlen feeds the (currently disabled) progress line below.
    # st.write(f'Input size: {len(upload_df)}, Output size: {outlen}, Still to process: {len(rundf)}')

    # Process in fixed-size chunks; each cleaned chunk is keyed the same way
    # and appended to the accumulated output.
    for _, chunk in tqdm(rundf.groupby(np.arange(len(rundf)) // chunksize)):
        result = clean_data(chunk['input__product_name'], chunk['input__brand'], chunk['input__product_category'], category_taxonomy)
        result['key'] = get_key(result)
        result = result.set_index('key')
        output_df = result if output_df is None else pd.concat([output_df, result])

    return output_df