cmagganas committed
Commit: d6674c7
Parent(s): 66602df

Upload folder using huggingface_hub

README.md CHANGED
@@ -4,6 +4,5 @@ emoji: 🧹
 sdk: streamlit
 sdk_version: 1.24.0
 app_file: app/app.py
-# base_path: app/
 pinned: false
 ---
app/dataclean.py ADDED
@@ -0,0 +1,173 @@
+
+import json
+from io import StringIO
+from tqdm import tqdm
+import csv
+from sqlalchemy import create_engine
+from yallmf.utils import run_with_timeout
+import pandas as pd
+import numpy as np
+import os
+import openai
+
+openai.api_key = os.getenv("OPENAI_API_KEY")
+
+INPUT_FILE = os.path.expanduser('data/dataclean_input.csv')
+OUTPUT_FILE = os.path.expanduser('data/dataclean_output.csv')
+# OUTPUT_FILE = os.path.expanduser('~/data/aiclean/output.csv')
+# CONFIGFILE = os.path.expanduser('~/config/cookies-dataclean.json')  # required by get_db_engine(); currently commented out
+
+def get_db_engine():
+    with open(CONFIGFILE) as f:
+        j = json.load(f)
+    dbconnstr = j['DB_CONN_STR']
+    return create_engine(dbconnstr,
+                         executemany_mode='batch',
+                         executemany_batch_page_size=1000)
+
+
+def clean_data(
+        input_product_names: pd.Series,
+        input_brands: pd.Series,
+        input_product_categories: pd.Series,
+        category_taxonomy: dict):
+
+    output_cols = ['brand', 'product_category', 'sub_product_category', 'strain_name']
+    ncols = len(output_cols)
+
+    p1 = f'''
+I am going to provide a data set of marijuana products and their metadata. Using the information I provide, I want you to provide me with the following information about the products.
+
+- Brand (brand)
+- Product category (product_category)
+- Sub product category (sub_product_category)
+- Strain name (strain_name)
+
+The following JSON shows all the acceptable Product Categories and their Sub Product Categories. Strictly adhere to the below mapping for valid product_category to sub_product_category relationships:
+
+{json.dumps(category_taxonomy)}
+
+Additional requirements:
+
+- The input data set is in CSV format, with commas as the field delimiter and newlines as the row delimiter.
+- Do not automatically assume that the information in the data set I provide is accurate.
+- Leave the 'sub_product_category' field blank unless there's a clear and direct match with one of the categories provided in the list. If there is no explicit information to confidently assign a sub_product_category, default to leaving it blank.
+- Strain names are only applicable for the following product categories: concentrate, preroll, vape, flower
+- Look for clues in the product name to determine which brand, product category, sub product category, and strain name the product should fall under. For Vape products, consider the words before 'Cartridge' or 'Cart' in the product name as potential strain names.
+- Every row of the Output CSV must have EXACTLY {ncols} columns.
+- When a field is left empty (e.g., 'sub_product_category' or 'strain_name'), simply leave it empty without placing an additional comma. Each row in the output CSV should always have only three commas separating the four fields regardless of whether some fields are empty. For instance, if 'sub_product_category' and 'strain_name' are empty, a row would look like this: "brand,product_category,,"
+- DO NOT EXPLAIN YOURSELF, ONLY RETURN A CSV WITH THESE COLUMNS: {', '.join(output_cols)}
+
+Input data set in CSV format:
+
+'''
+    df = pd.DataFrame({'input__product_name': input_product_names,
+                       'input__brand': input_brands,
+                       'input__product_category': input_product_categories}).reset_index(drop=True)
+    # remove commas from all strings so they cannot corrupt the CSV sent to the model
+    df2 = df.copy()
+    for col in df2.columns:
+        df2[col] = df2[col].str.replace(',', '')
+
+    # send to LLM
+    p2 = df2.to_csv(index=False, quoting=csv.QUOTE_ALL)
+    messages = [{'role': 'system', 'content': 'You are a helpful assistant. Return a properly-formatted CSV with the correct number of columns.'},
+                {'role': 'user', 'content': p1 + p2 + '\n\nOutput CSV with header row:\n\n'}
+                ]
+    comp = run_with_timeout(openai.ChatCompletion.create,
+                            model='gpt-4',
+                            messages=messages,
+                            max_tokens=2000,
+                            timeout=300,
+                            temperature=0.2
+                            )
+    res = comp['choices'][0]['message']['content']
+
+    # remove rows with the wrong number of columns
+    keeprows = []
+    for i, s in enumerate(res.split('\n')):
+        if i == 0:  # always keep the header row
+            keeprows.append(s)
+            continue
+        _ncols = len(s.split(','))
+        if _ncols != ncols:
+            print(f'Got {_ncols} columns, skipping row {i-1} ({s})')
+            df = df.drop(i-1)
+        else:
+            keeprows.append(s)
+    df = df.reset_index(drop=True)
+
+    resdf = pd.read_csv(StringIO('\n'.join(keeprows)))
+
+    assert len(df) == len(resdf), 'Result CSV did not match input CSV in length'
+    df = pd.concat([df.reset_index(drop=True), resdf.reset_index(drop=True)], axis=1)
+    # check category/subcategory against the taxonomy
+    dropidxs = []
+    for idx, row in df.iterrows():
+        drop = False
+        if pd.isna(row['product_category']) and not pd.isna(row['sub_product_category']):
+            drop = True
+            print('product_category is null while sub_product_category is not null, dropping')
+        if not pd.isna(row['product_category']):
+            if row['product_category'] not in category_taxonomy.keys():
+                print(f'category "{row["product_category"]}" not in taxonomy, dropping row')
+                drop = True
+            elif not pd.isna(row['sub_product_category']):
+                if row['sub_product_category'] not in category_taxonomy[row['product_category']]:
+                    print(f'subcategory "{row["sub_product_category"]}" not valid for category {row["product_category"]}, dropping row')
+                    drop = True
+        if drop:
+            dropidxs.append(idx)
+    df = df.drop(dropidxs)
+
+    return df
+
+def get_key(df):
+    return df['input__product_name'] + df['input__brand'] + df['input__product_category']  # NB: any NaN input yields a NaN key
+
+def main(input_file=INPUT_FILE, output_file=OUTPUT_FILE, chunksize=30):
+    category_taxonomy = {
+        "Wellness": ["Mushroom Caps", "CBD Tincture/Caps/etc", "Promo/ Sample", "Capsule", "Liquid Flower", ""],
+        "Concentrate": ["Diamonds", "Shatter", "Sugar", "Promo/ Sample", "Badder", "Diamonds and Sauce", "Rosin", "Cookies Dough", "Flan", "Cookie Dough", ""],
+        "Preroll": ["Cubano", "Joint", "Promo/ Sample", "Blunt", "Infused Joint", "Packwoods Blunt", "Infused Blunt", "Napalm", ""],
+        "Vape": ["Terp Sauce", "Gpen 0.5", "Cured Resin", "Solventless Rosin", "510", "Dry Flower Series", "Natural Terp Series", "Promo/ Sample", "Dart Pod 0.5", "Raw Garden", "Live Flower Series", "Rosin", "Disposable", ""],
+        "Edible": ["Cookies", "Gummies", "Mint", "Promo/ Sample", "Beverage", "Chocolate", ""],
+        "Grow Products": ["Promo/ Sample", ""],
+        "Flower": ["Promo/ Sample", "Bud", ""],
+        "Accessory": ["Promo/ Sample", ""]
+    }
+
+    # expects input__product_name, input__brand, input__product_category
+    dfin = pd.read_csv(input_file)
+    # expects same as above + output: brand, product_category, sub_product_category, strain_name
+    dfout = None
+    try:
+        dfout = pd.read_csv(output_file)
+    except FileNotFoundError:
+        pass
+
+    # join together and get the diff
+    dfin['key'] = get_key(dfin)
+    dfin = dfin.set_index('key')
+    if dfout is None:
+        rundf = dfin
+        outlen = 0
+    else:
+        dfout['key'] = get_key(dfout)
+        dfout = dfout.set_index('key')
+        rundf = dfin.loc[~dfin.index.isin(dfout.index)]
+        outlen = len(dfout)
+
+    print(f'''Input size {len(dfin)}, Output size {outlen}, still to process {len(rundf)}, chunksize {chunksize}. Processing...''')
+    for _, chunk in tqdm(rundf.groupby(np.arange(len(rundf)) // chunksize)):
+        result = clean_data(chunk['input__product_name'], chunk['input__brand'], chunk['input__product_category'], category_taxonomy)
+        result['key'] = get_key(result)
+        result = result.set_index('key')
+        if dfout is None:
+            dfout = result
+        else:
+            dfout = pd.concat([dfout, result])
+        dfout.to_csv(output_file, index=False)
+
+if __name__ == '__main__':
+    main()
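
For orientation: main() makes the new script resumable. Each row is keyed by concatenating the three input__ columns (see get_key), rows whose key already appears in the output CSV are skipped, and partial results are flushed after every chunk, so an interrupted run picks up where it left off. A minimal usage sketch; the file paths and chunk size below are hypothetical, not from this commit:

    # Hypothetical invocation of app/dataclean.py; assumes OPENAI_API_KEY is set
    # and the input CSV has the three input__ columns.
    from app.dataclean import main

    main(input_file='data/my_products.csv',
         output_file='data/my_products_clean.csv',
         chunksize=10)  # smaller chunks mean more, but shorter, GPT-4 calls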
app/openai_chat_completion.py CHANGED
@@ -42,7 +42,7 @@ class OpenAIChatCompletions:
 
 
     # function to use test data to predict completions
-    def predict_jsonl(self, path_or_buf='data/cookies_test.jsonl', n_samples=None, n_shot=None):
+    def predict_jsonl(self, path_or_buf='../data/cookies_test.jsonl', n_samples=None, n_shot=None):
         jsonObj = pd.read_json(path_or_buf=path_or_buf, lines=True)
         if n_samples is not None:
             jsonObj = jsonObj.sample(n_samples, random_state=42)
@@ -71,6 +71,6 @@ class OpenAIChatCompletions:
 
     # a method that samples n rows from a jsonl file, returning a pandas dataframe
     @staticmethod
-    def _sample_jsonl(path_or_buf='data/cookies_train.jsonl', n_samples=5):
+    def _sample_jsonl(path_or_buf='../data/cookies_train.jsonl', n_samples=5):
         jsonObj = pd.read_json(path_or_buf=path_or_buf, lines=True)
         return jsonObj.sample(n_samples, random_state=42)
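
Note on the change above: both default paths moved from 'data/...' to '../data/...', so they now resolve one directory above the process's current working directory, likely because the app runs from app/. Relative defaults like these depend on where Python is launched. A CWD-independent alternative, shown only as a hypothetical sketch and not what this commit does, would anchor the paths on the module's own location:

    import os

    # Hypothetical alternative: resolve data/ relative to this module, not the CWD.
    DATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data')
    TEST_JSONL = os.path.join(DATA_DIR, 'cookies_test.jsonl')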
data/dataclean_input.csv ADDED
@@ -0,0 +1,15 @@
+input__product_name,input__brand,input__product_category
+Gary Payton,Cookies,PackedBud
+RMG | Medical | Durban Poison,,BulkBud
+Gary Payton #20 3.5g | Cookies,Cookies,PackedBud
+London Poundcake #75 | Eighths,,PackedBud
+JEF 3.5G,Dubz Garden X Official Gooniez,PackedBud
+Gary Payton 3.5g |,Cookies,Flower 3.5g
+Gelatti | Eighths,,PackedBud
+SAT 3.5g,The Marathon Cultivation,PackedBud
+Cookies - Apples & Bananas - Indoor - 3.5g,Cookies,Packed > Flower
+Georgia Pie,Cookies,PackedBud
+JEF 3.5G,Cookies,PackedBud
+Cookies - Gary Payton - 3.5g,Cookies,"REC - Packed > REC, REC - Flower"
+London Poundcake #75 3.5g | Cookies,Cookies,PackedBud
+London Pound Cake #75 3.5g |,Cookies,Flower 3.5g
data/dataclean_output.csv ADDED
@@ -0,0 +1,15 @@
+input__product_name,input__brand,input__product_category,brand,product_category,sub_product_category,strain_name
+Gary Payton,Cookies,PackedBud,Cookies,Flower,,Gary Payton
+RMG | Medical | Durban Poison,,BulkBud,,Flower,,RMG | Medical | Durban Poison
+Gary Payton #20 3.5g | Cookies,Cookies,PackedBud,Cookies,Flower,,Gary Payton #20
+London Poundcake #75 | Eighths,,PackedBud,,Flower,,London Poundcake #75
+JEF 3.5G,Dubz Garden X Official Gooniez,PackedBud,Dubz Garden X Official Gooniez,Flower,,JEF
+Gary Payton 3.5g |,Cookies,Flower 3.5g,Cookies,Flower,,Gary Payton
+Gelatti | Eighths,,PackedBud,,Flower,,Gelatti
+SAT 3.5g,The Marathon Cultivation,PackedBud,The Marathon Cultivation,Flower,,SAT
+Cookies - Apples & Bananas - Indoor - 3.5g,Cookies,Packed > Flower,Cookies,Flower,,Apples & Bananas
+Georgia Pie,Cookies,PackedBud,Cookies,Flower,,Georgia Pie
+JEF 3.5G,Cookies,PackedBud,Cookies,Flower,,JEF
+Cookies - Gary Payton - 3.5g,Cookies,"REC - Packed > REC, REC - Flower",Cookies,Flower,,Gary Payton
+London Poundcake #75 3.5g | Cookies,Cookies,PackedBud,Cookies,Flower,,London Poundcake #75
+London Pound Cake #75 3.5g |,Cookies,Flower 3.5g,Cookies,Flower,,London Pound Cake #75
hf-space-upload.ipynb CHANGED
@@ -2,24 +2,16 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Users/christos/opt/miniconda3/envs/kd-llm-dc/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      " from .autonotebook import tqdm as notebook_tqdm\n"
-     ]
-    },
     {
      "data": {
       "text/plain": [
        "'https://huggingface.co/spaces/kaleidoscope-data/data-cleaning-llm/tree/main/'"
       ]
      },
-     "execution_count": 1,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
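
Only execution counts and a stale tqdm warning changed in the notebook; the surviving cell output is the Space URL, so the notebook evidently performs the folder upload named in the commit message. The exact call is not visible in this diff, but with huggingface_hub it would look something like this hypothetical sketch (repo_id taken from the output above):

    from huggingface_hub import HfApi

    # Hypothetical reconstruction of the upload step; not shown in the diff.
    HfApi().upload_folder(
        folder_path='.',
        repo_id='kaleidoscope-data/data-cleaning-llm',
        repo_type='space',
    )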
requirements.txt CHANGED
@@ -4,5 +4,8 @@ openai==0.27.8
 pandas==2.0.2
 python-dotenv==1.0.0
 scikit_learn==1.2.2
+SQLAlchemy==2.0.18
+streamlit==1.24.0
 tenacity==8.2.2
-streamlit==1.24.0
+tqdm==4.65.0
+yallmf @ git+https://github.com/greendata-ai/yallmf.git
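
The additions line up with the new imports in app/dataclean.py: SQLAlchemy for create_engine, tqdm for the progress bar, and yallmf (installed directly from GitHub via the git+ URL) for the run_with_timeout helper.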