cmagganas committed
Commit 73588d1
1 Parent(s): db2ee0f

Upload folder using huggingface_hub

app/__pycache__/util.cpython-310.pyc ADDED
Binary file (2.16 kB)
 
app/app.py CHANGED
@@ -1,8 +1,9 @@
- # this app is streamlit app for the current project hosted on huggingface spaces
+ """ this app is streamlit app for the current project hosted on HuggingFace spaces """

import streamlit as st
from openai_chat_completion import OpenAIChatCompletions
from dataclean_hf import main
+ from util import json_to_dict #, join_dicts

st.title("Kaleidoscope Data - Data Cleaning LLM App")

@@ -18,17 +19,19 @@ if st.button("Run Data Cleaning API"):
# if text_input is not empty, run data cleaning API on text_input
if text_input:

- model = "gpt-4" # "gpt-3.5-turbo"
- sys_mes = "prompts/gpt4-system-message.txt"
+ MODEL = "gpt-4" # "gpt-3.5-turbo"
+ sys_mes = open('../prompts/gpt4-system-message2.txt', 'r').read()

# instantiate OpenAIChatCompletions class
# get response from openai_chat_completion method
- chat = OpenAIChatCompletions(model=model, system_message=sys_mes)
+ chat = OpenAIChatCompletions(model=MODEL, system_message=sys_mes)
response = chat.openai_chat_completion(text_input, n_shot=None)


# display response
- st.write(response['choices'][0]['message']['content'])
+ # st.write(response['choices'][0]['message']['content'])
+ response_content = response['choices'][0]['message']['content']
+ st.write(json_to_dict(response_content))

# if csv_file is not empty, run data cleaning API on csv_file
elif csv_file:
@@ -38,6 +41,14 @@ if st.button("Run Data Cleaning API"):

@st.cache_data
def convert_df(df):
+ """coverting dataframe to csv
+
+ Args:
+ df (_type_): pd.DataFrame
+
+ Returns:
+ _type_: csv
+ """
# IMPORTANT: Cache the conversion to prevent computation on every rerun
return df.to_csv().encode('utf-8')
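The display change above routes the model's reply through json_to_dict before rendering. Below is a minimal sketch of that new branch, with the OpenAI call replaced by a literal reply copied from the notebook output later in this commit; json_to_dict is a condensed but behaviorally equivalent copy of the helper in app/util.py, and in the real app this dict is what gets passed to st.write.

    # Stand-in for the reply normally returned by OpenAIChatCompletions;
    # the string mirrors the JSON-formatted output shown in output_format.ipynb.
    response_content = '{"brand": "Cookies", "product_category": "Edibles", "sub_product_category": "Gummy", "strain_name": "London Pound Cake 75"}'

    def json_to_dict(json_string):
        # Condensed copy of app/util.py's parser: scrub JSON punctuation,
        # turn ':' into ',', then pair up the remaining tokens.
        for ch in ('\n', '\t', ' ', '"', '{', '}'):
            json_string = json_string.replace(ch, '')
        tokens = json_string.replace(':', ',').split(',')
        return {tokens[i]: tokens[i + 1] for i in range(0, len(tokens), 2)}

    parsed = json_to_dict(response_content)
    print(parsed)
    # {'brand': 'Cookies', 'product_category': 'Edibles', 'sub_product_category': 'Gummy', 'strain_name': 'LondonPoundCake75'}
    # In app.py this dict is rendered with st.write(parsed) instead of the raw string.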
 
app/output_format.ipynb ADDED
@@ -0,0 +1,393 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# import OpenAIChatCompletions class from openai_chat_completion.py file and compare_completion_and_prediction function from util.py file\n",
+ "from openai_chat_completion import OpenAIChatCompletions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from dotenv import load_dotenv\n",
+ "load_dotenv()\n",
+ "\n",
+ "import openai\n",
+ "\n",
+ "# set OPENAI_API_KEY environment variable from .env file\n",
+ "openai.api_key = os.getenv(\"OPENAI_API_KEY\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'I am going to provide marijuana product information. Using the information I provide, I want you to provide me with the following information about the product.\\n\\n - Brand (brand)\\n - product category (product_category)\\n - sub product category (sub_product_category)\\n - strain name (strain_name)\\n\\nAdditional requirements:\\n\\n- DO NOT EXPLAIN YOUR SELF \\n\\nProduct data below '"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "system_message = open('../prompts/gpt4-system-message.txt', 'r').read()\n",
+ "system_message"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "I am going to provide marijuana product information. Using the information I provide, I want you to provide me with the following information about the product.\n",
+ "\n",
+ " - Brand (brand)\n",
+ " - product category (product_category)\n",
+ " - sub product category (sub_product_category)\n",
+ " - strain name (strain_name)\n",
+ "\n",
+ "Additional requirements:\n",
+ "\n",
+ "- DO NOT EXPLAIN YOUR SELF \n",
+ "\n",
+ "Product data below \n"
+ ]
+ }
+ ],
+ "source": [
+ "print(system_message)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "chatInstance = OpenAIChatCompletions(system_message=system_message)\n",
+ "chat_response = chatInstance.openai_chat_completion(prompt=\"Cookies - London Pound Cake 75 - Gummy - 10ct - 100mg\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "- Brand: Cookies\n",
+ "- Product Category: Edibles\n",
+ "- Sub Product Category: Gummy\n",
+ "- Strain Name: London Pound Cake 75\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(chat_response['choices'][0]['message']['content'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "system_message2 = \"\"\"\n",
+ "I am going to provide marijuana product information. Using the information I provide, I want you to provide me with the following information about the product.\n",
+ "\n",
+ " - Brand (brand)\n",
+ " - product category (product_category)\n",
+ " - sub product category (sub_product_category)\n",
+ " - strain name (strain_name)\n",
+ "\n",
+ "Additional requirements:\n",
+ "\n",
+ "DO NOT EXPLAIN YOUR SELF \n",
+ "Format output in JSON format\n",
+ "\n",
+ "example output:\n",
+ "{\"col1\": \"value1\", \"col2\": \"value2\", \"col3\": \"value3\"}\n",
+ "\n",
+ "---\n",
+ "\n",
+ "Product data below \n",
+ "\"\"\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{\"brand\": \"Cookies\", \"product_category\": \"Edibles\", \"sub_product_category\": \"Gummy\", \"strain_name\": \"London Pound Cake 75\"}\n"
+ ]
+ }
+ ],
+ "source": [
+ "chatInstance2 = OpenAIChatCompletions(system_message=system_message2)\n",
+ "chat_response2 = chatInstance2.openai_chat_completion(prompt=\"Cookies - London Pound Cake 75 - Gummy - 10ct - 100mg\")\n",
+ "print(chat_response2['choices'][0]['message']['content'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "chat_response2_content = chat_response2['choices'][0]['message']['content']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'brand': 'Cookies',\n",
+ " 'product_category': 'Edibles',\n",
+ " 'sub_product_category': 'Gummy',\n",
+ " 'strain_name': 'LondonPoundCake75'}"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# write function that takes string in the form of json and returns a dictionary\n",
+ "\n",
+ "def json_to_dict(json_string):\n",
+ "    json_string = json_string.replace('\\n', '')\n",
+ "    json_string = json_string.replace('\\t', '')\n",
+ "    json_string = json_string.replace(' ', '')\n",
+ "    json_string = json_string.replace('\"', '')\n",
+ "    json_string = json_string.replace('{', '')\n",
+ "    json_string = json_string.replace('}', '')\n",
+ "    json_string = json_string.replace(':', ',')\n",
+ "    json_string = json_string.split(',')\n",
+ "    return {\n",
+ "        json_string[i]: json_string[i + 1]\n",
+ "        for i in range(0, len(json_string), 2)\n",
+ "    }\n",
+ "\n",
+ "output_as_json = json_to_dict(chat_response2_content)\n",
+ "assert type(output_as_json) == dict\n",
+ "output_as_json"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ "    .dataframe tbody tr th:only-of-type {\n",
+ "        vertical-align: middle;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe tbody tr th {\n",
+ "        vertical-align: top;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe thead th {\n",
+ "        text-align: right;\n",
+ "    }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ "  <thead>\n",
+ "    <tr style=\"text-align: right;\">\n",
+ "      <th></th>\n",
+ "      <th>brand</th>\n",
+ "      <th>product_category</th>\n",
+ "      <th>sub_product_category</th>\n",
+ "      <th>strain_name</th>\n",
+ "    </tr>\n",
+ "  </thead>\n",
+ "  <tbody>\n",
+ "    <tr>\n",
+ "      <th>0</th>\n",
+ "      <td>Cookies</td>\n",
+ "      <td>Edibles</td>\n",
+ "      <td>Gummy</td>\n",
+ "      <td>LondonPoundCake75</td>\n",
+ "    </tr>\n",
+ "  </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " brand product_category sub_product_category strain_name\n",
+ "0 Cookies Edibles Gummy LondonPoundCake75"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# write a function that takes a dictionary and returns a dataframe\n",
+ "import pandas as pd\n",
+ "\n",
+ "def dict_to_df(dictionary):\n",
+ "    return pd.DataFrame(dictionary, index=[0])\n",
+ "\n",
+ "dict_to_df(output_as_json)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{\"brand\": \"Cookies\", \"product_category\": \"Edibles\", \"sub_product_category\": \"Gummy\", \"strain_name\": \"London Pound Cake 75\"}\n",
+ "{\"brand\": \"Berlin\", \"product_category\": \"Edibles\", \"sub_product_category\": \"Brownies\", \"strain_name\": \"Chocolate Hazelnut 69\"}\n"
+ ]
+ }
+ ],
+ "source": [
+ "chat_response2a = chatInstance2.openai_chat_completion(prompt=\"Cookies - London Pound Cake 75 - Gummy - 10ct - 100mg\")\n",
+ "chat_response2b = chatInstance2.openai_chat_completion(prompt=\"Brownies - Berlin Chocolate Hazelnut 69 - Flower - 1ct - 69mg\")\n",
+ "print(chat_response2a['choices'][0]['message']['content'])\n",
+ "print(chat_response2b['choices'][0]['message']['content'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def join_dicts(dict1, dict2):\n",
+ "    return {key:[dict1[key], dict2[key]] for key in dict1}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'brand': ['Cookies', 'Berlin'],\n",
+ " 'product_category': ['Edibles', 'Edibles'],\n",
+ " 'sub_product_category': ['Gummy', 'Brownies'],\n",
+ " 'strain_name': ['LondonPoundCake75', 'ChocolateHazelnut69']}"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "out2a_as_json = json_to_dict(chat_response2a['choices'][0]['message']['content'])\n",
+ "out2b_as_json = json_to_dict(chat_response2b['choices'][0]['message']['content'])\n",
+ "\n",
+ "out3_as_json = join_dicts(out2a_as_json, out2b_as_json)\n",
+ "out3_as_json"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Try via util.py File"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from util import json_to_dict, join_dicts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'brand': ['Cookies', 'Berlin'],\n",
+ " 'product_category': ['Edibles', 'Edibles'],\n",
+ " 'sub_product_category': ['Gummy', 'Brownies'],\n",
+ " 'strain_name': ['LondonPoundCake75', 'ChocolateHazelnut69']}"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "out2a_as_json = json_to_dict(chat_response2a['choices'][0]['message']['content'])\n",
+ "out2b_as_json = json_to_dict(chat_response2b['choices'][0]['message']['content'])\n",
+ "\n",
+ "out3_as_json = join_dicts(out2a_as_json, out2b_as_json)\n",
+ "out3_as_json"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "kd-llm-dc",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.11"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
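
The notebook's end-to-end flow is: get a JSON-formatted reply per product, parse it with json_to_dict, combine two products with join_dicts, and build a DataFrame. Below is that flow condensed into a runnable sketch that skips the OpenAI calls and starts from two replies copied from the notebook's printed outputs; json_to_dict and join_dicts are condensed but equivalent copies of the notebook's definitions.

    import pandas as pd

    # Two JSON-formatted model replies, copied from the notebook's printed outputs (cell 14).
    reply_a = '{"brand": "Cookies", "product_category": "Edibles", "sub_product_category": "Gummy", "strain_name": "London Pound Cake 75"}'
    reply_b = '{"brand": "Berlin", "product_category": "Edibles", "sub_product_category": "Brownies", "strain_name": "Chocolate Hazelnut 69"}'

    def json_to_dict(json_string):
        # Same token-pairing parser defined in the notebook (and moved into util.py below).
        for ch in ('\n', '\t', ' ', '"', '{', '}'):
            json_string = json_string.replace(ch, '')
        tokens = json_string.replace(':', ',').split(',')
        return {tokens[i]: tokens[i + 1] for i in range(0, len(tokens), 2)}

    def join_dicts(dict1, dict2):
        # Pair the two single-product dicts column-wise, as in cell 15.
        return {key: [dict1[key], dict2[key]] for key in dict1}

    joined = join_dicts(json_to_dict(reply_a), json_to_dict(reply_b))
    df = pd.DataFrame(joined)  # one row per product, one column per field
    print(df)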
app/util.py CHANGED
@@ -1,5 +1,17 @@
- # write a function that compares the completion and prediction, separating each string by comma into their respective columns, then compare each column and return a dataframe with the results
def compare_completion_and_prediction(completion, prediction, verbose=False):
+ """
+ a function that compares the completion and prediction
+ separating each string by comma into their respective columns,
+ then compare each column and return a DataFrame with the results
+
+ Args:
+ completion (_type_): str
+ prediction (_type_): str
+ verbose (bool, optional): bool. Defaults to False.
+
+ Returns:
+ _type_: json object with completion, prediction, matches, and num_correct
+ """
# if verbose is True, print the completion and prediction strings
if verbose:
print("Completion:", completion, f"type({type(completion)}):")
@@ -7,14 +19,38 @@ def compare_completion_and_prediction(completion, prediction, verbose=False):
# split completion and prediction strings on comma character
completion = completion.split(',')
prediction = prediction.split(',')
- # create a column that counts the number of matchs between completion and prediction
+ # create a column that counts the number of matches between completion and prediction
matches = [completion[i] == prediction[i] for i in range(len(completion))]
- # create a json dictionary with the completion, prediction, matches, and num_correct fields
- json_dict = {
+ return {
"completion": completion,
"prediction": prediction,
"matches": matches,
- "num_correct": sum(matches)
+ "num_correct": sum(matches),
}
- # return the json dictionary
- return json_dict
+
+ def json_to_dict(json_string):
+ """function that takes string in the form of json and returns a dictionary"""
+ json_string = json_string.replace('\n', '')
+ json_string = json_string.replace('\t', '')
+ json_string = json_string.replace(' ', '')
+ json_string = json_string.replace('"', '')
+ json_string = json_string.replace('{', '')
+ json_string = json_string.replace('}', '')
+ json_string = json_string.replace(':', ',')
+ json_string = json_string.split(',')
+ return {
+ json_string[i]: json_string[i + 1]
+ for i in range(0, len(json_string), 2)
+ }
+
+ def join_dicts(dict1, dict2):
+ """function that joins two dictionaries into one dictionary
+
+ Args:
+ dict1 (_type_): dict
+ dict2 (_type_): dict
+
+ Returns:
+ _type_: dict
+ """
+ return {key:[dict1[key], dict2[key]] for key in dict1}
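
For reference, a quick usage sketch of the refactored compare_completion_and_prediction helper. The two comma-separated strings below are made up for illustration; the function body is a condensed copy of the one above with the verbose printing omitted.

    def compare_completion_and_prediction(completion, prediction):
        # Split both strings on commas and compare the fields position by position.
        completion = completion.split(',')
        prediction = prediction.split(',')
        matches = [completion[i] == prediction[i] for i in range(len(completion))]
        return {
            "completion": completion,
            "prediction": prediction,
            "matches": matches,
            "num_correct": sum(matches),
        }

    # Hypothetical ground truth vs. model output, in the comma-separated layout the function assumes.
    result = compare_completion_and_prediction(
        "Cookies,Edibles,Gummy,London Pound Cake 75",
        "Cookies,Edibles,Brownies,London Pound Cake 75",
    )
    print(result["matches"])      # [True, True, False, True]
    print(result["num_correct"])  # 3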
prompts/gpt4-system-message2.txt CHANGED
@@ -7,6 +7,12 @@ I am going to provide marijuana product information. Using the information I pro

Additional requirements:

- - DO NOT EXPLAIN YOUR SELF
+ DO NOT EXPLAIN YOUR SELF
+ Format output in JSON format
+
+ example output:
+ {"col1": "value1", "col2": "value2", "col3": "value3"}
+
+ ---

Product data below
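
The revised system message asks the model to answer as a single JSON object. Below is a hedged sketch of how a prompt file like this is typically paired with a product string in a chat-completion request; the actual wiring lives in openai_chat_completion.py, which this commit does not touch, so the message layout and the repo-root-relative path are assumptions, not the project's confirmed code.

    import os
    import openai  # pre-1.0 openai client, matching the response['choices'][0]... access used in app.py

    openai.api_key = os.getenv("OPENAI_API_KEY")

    # Assumed wiring: system message from the prompt file, product data as the user turn.
    sys_mes = open('prompts/gpt4-system-message2.txt', 'r').read()
    product = "Cookies - London Pound Cake 75 - Gummy - 10ct - 100mg"

    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": sys_mes},
            {"role": "user", "content": product},
        ],
    )
    print(response['choices'][0]['message']['content'])
    # Expected shape, per the "example output" line added above:
    # {"brand": "Cookies", "product_category": "Edibles", "sub_product_category": "Gummy", "strain_name": "London Pound Cake 75"}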