cmagganas committed
Commit 73588d1
1 Parent(s): db2ee0f

Upload folder using huggingface_hub

app/__pycache__/util.cpython-310.pyc ADDED
Binary file (2.16 kB)
 
app/app.py CHANGED
@@ -1,8 +1,9 @@
- # this app is streamlit app for the current project hosted on huggingface spaces
+ """ this app is streamlit app for the current project hosted on HuggingFace spaces """

import streamlit as st
from openai_chat_completion import OpenAIChatCompletions
from dataclean_hf import main
+ from util import json_to_dict #, join_dicts

st.title("Kaleidoscope Data - Data Cleaning LLM App")

@@ -18,17 +19,19 @@ if st.button("Run Data Cleaning API"):
# if text_input is not empty, run data cleaning API on text_input
if text_input:

- model = "gpt-4" # "gpt-3.5-turbo"
- sys_mes = "prompts/gpt4-system-message.txt"
+ MODEL = "gpt-4" # "gpt-3.5-turbo"
+ sys_mes = open('../prompts/gpt4-system-message2.txt', 'r').read()

# instantiate OpenAIChatCompletions class
# get response from openai_chat_completion method
- chat = OpenAIChatCompletions(model=model, system_message=sys_mes)
+ chat = OpenAIChatCompletions(model=MODEL, system_message=sys_mes)
response = chat.openai_chat_completion(text_input, n_shot=None)


# display response
- st.write(response['choices'][0]['message']['content'])
+ # st.write(response['choices'][0]['message']['content'])
+ response_content = response['choices'][0]['message']['content']
+ st.write(json_to_dict(response_content))

# if csv_file is not empty, run data cleaning API on csv_file
elif csv_file:
@@ -38,6 +41,14 @@ if st.button("Run Data Cleaning API"):

@st.cache_data
def convert_df(df):
+ """coverting dataframe to csv
+
+ Args:
+ df (_type_): pd.DataFrame
+
+ Returns:
+ _type_: csv
+ """
# IMPORTANT: Cache the conversion to prevent computation on every rerun
return df.to_csv().encode('utf-8')
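The display change above routes the model's reply through json_to_dict before rendering. Below is a minimal sketch of that new branch, with the OpenAI call replaced by a literal reply copied from the notebook output later in this commit; json_to_dict is a condensed but behaviorally equivalent copy of the helper in app/util.py, and in the real app this dict is what gets passed to st.write.

    # Stand-in for the reply normally returned by OpenAIChatCompletions;
    # the string mirrors the JSON-formatted output shown in output_format.ipynb.
    response_content = '{"brand": "Cookies", "product_category": "Edibles", "sub_product_category": "Gummy", "strain_name": "London Pound Cake 75"}'

    def json_to_dict(json_string):
        # Condensed copy of app/util.py's parser: scrub JSON punctuation,
        # turn ':' into ',', then pair up the remaining tokens.
        for ch in ('\n', '\t', ' ', '"', '{', '}'):
            json_string = json_string.replace(ch, '')
        tokens = json_string.replace(':', ',').split(',')
        return {tokens[i]: tokens[i + 1] for i in range(0, len(tokens), 2)}

    parsed = json_to_dict(response_content)
    print(parsed)
    # {'brand': 'Cookies', 'product_category': 'Edibles', 'sub_product_category': 'Gummy', 'strain_name': 'LondonPoundCake75'}
    # In app.py this dict is rendered with st.write(parsed) instead of the raw string.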
 
app/output_format.ipynb ADDED
@@ -0,0 +1,393 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# import OpenAIChatCompletions class from openai_chat_completion.py file and compare_completion_and_prediction function from util.py file\n",
+ "from openai_chat_completion import OpenAIChatCompletions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from dotenv import load_dotenv\n",
+ "load_dotenv()\n",
+ "\n",
+ "import openai\n",
+ "\n",
+ "# set OPENAI_API_KEY environment variable from .env file\n",
+ "openai.api_key = os.getenv(\"OPENAI_API_KEY\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'I am going to provide marijuana product information. Using the information I provide, I want you to provide me with the following information about the product.\\n\\n - Brand (brand)\\n - product category (product_category)\\n - sub product category (sub_product_category)\\n - strain name (strain_name)\\n\\nAdditional requirements:\\n\\n- DO NOT EXPLAIN YOUR SELF \\n\\nProduct data below '"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "system_message = open('../prompts/gpt4-system-message.txt', 'r').read()\n",
+ "system_message"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "I am going to provide marijuana product information. Using the information I provide, I want you to provide me with the following information about the product.\n",
+ "\n",
+ " - Brand (brand)\n",
+ " - product category (product_category)\n",
+ " - sub product category (sub_product_category)\n",
+ " - strain name (strain_name)\n",
+ "\n",
+ "Additional requirements:\n",
+ "\n",
+ "- DO NOT EXPLAIN YOUR SELF \n",
+ "\n",
+ "Product data below \n"
+ ]
+ }
+ ],
+ "source": [
+ "print(system_message)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "chatInstance = OpenAIChatCompletions(system_message=system_message)\n",
+ "chat_response = chatInstance.openai_chat_completion(prompt=\"Cookies - London Pound Cake 75 - Gummy - 10ct - 100mg\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "- Brand: Cookies\n",
+ "- Product Category: Edibles\n",
+ "- Sub Product Category: Gummy\n",
+ "- Strain Name: London Pound Cake 75\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(chat_response['choices'][0]['message']['content'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "system_message2 = \"\"\"\n",
+ "I am going to provide marijuana product information. Using the information I provide, I want you to provide me with the following information about the product.\n",
+ "\n",
+ " - Brand (brand)\n",
+ " - product category (product_category)\n",
+ " - sub product category (sub_product_category)\n",
+ " - strain name (strain_name)\n",
+ "\n",
+ "Additional requirements:\n",
+ "\n",
+ "DO NOT EXPLAIN YOUR SELF \n",
+ "Format output in JSON format\n",
+ "\n",
+ "example output:\n",
+ "{\"col1\": \"value1\", \"col2\": \"value2\", \"col3\": \"value3\"}\n",
+ "\n",
+ "---\n",
+ "\n",
+ "Product data below \n",
+ "\"\"\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{\"brand\": \"Cookies\", \"product_category\": \"Edibles\", \"sub_product_category\": \"Gummy\", \"strain_name\": \"London Pound Cake 75\"}\n"
+ ]
+ }
+ ],
+ "source": [
+ "chatInstance2 = OpenAIChatCompletions(system_message=system_message2)\n",
+ "chat_response2 = chatInstance2.openai_chat_completion(prompt=\"Cookies - London Pound Cake 75 - Gummy - 10ct - 100mg\")\n",
+ "print(chat_response2['choices'][0]['message']['content'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "chat_response2_content = chat_response2['choices'][0]['message']['content']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'brand': 'Cookies',\n",
+ " 'product_category': 'Edibles',\n",
+ " 'sub_product_category': 'Gummy',\n",
+ " 'strain_name': 'LondonPoundCake75'}"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# write function that takes string in the form of json and returns a dictionary\n",
+ "\n",
+ "def json_to_dict(json_string):\n",
+ "    json_string = json_string.replace('\\n', '')\n",
+ "    json_string = json_string.replace('\\t', '')\n",
+ "    json_string = json_string.replace(' ', '')\n",
+ "    json_string = json_string.replace('\"', '')\n",
+ "    json_string = json_string.replace('{', '')\n",
+ "    json_string = json_string.replace('}', '')\n",
+ "    json_string = json_string.replace(':', ',')\n",
+ "    json_string = json_string.split(',')\n",
+ "    return {\n",
+ "        json_string[i]: json_string[i + 1]\n",
+ "        for i in range(0, len(json_string), 2)\n",
+ "    }\n",
+ "\n",
+ "output_as_json = json_to_dict(chat_response2_content)\n",
+ "assert type(output_as_json) == dict\n",
+ "output_as_json"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ "    .dataframe tbody tr th:only-of-type {\n",
+ "        vertical-align: middle;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe tbody tr th {\n",
+ "        vertical-align: top;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe thead th {\n",
+ "        text-align: right;\n",
+ "    }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ "  <thead>\n",
+ "    <tr style=\"text-align: right;\">\n",
+ "      <th></th>\n",
+ "      <th>brand</th>\n",
+ "      <th>product_category</th>\n",
+ "      <th>sub_product_category</th>\n",
+ "      <th>strain_name</th>\n",
+ "    </tr>\n",
+ "  </thead>\n",
+ "  <tbody>\n",
+ "    <tr>\n",
+ "      <th>0</th>\n",
+ "      <td>Cookies</td>\n",
+ "      <td>Edibles</td>\n",
+ "      <td>Gummy</td>\n",
+ "      <td>LondonPoundCake75</td>\n",
+ "    </tr>\n",
+ "  </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " brand product_category sub_product_category strain_name\n",
+ "0 Cookies Edibles Gummy LondonPoundCake75"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# write a function that takes a dictionary and returns a dataframe\n",
+ "import pandas as pd\n",
+ "\n",
+ "def dict_to_df(dictionary):\n",
+ "    return pd.DataFrame(dictionary, index=[0])\n",
+ "\n",
+ "dict_to_df(output_as_json)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{\"brand\": \"Cookies\", \"product_category\": \"Edibles\", \"sub_product_category\": \"Gummy\", \"strain_name\": \"London Pound Cake 75\"}\n",
+ "{\"brand\": \"Berlin\", \"product_category\": \"Edibles\", \"sub_product_category\": \"Brownies\", \"strain_name\": \"Chocolate Hazelnut 69\"}\n"
+ ]
+ }
+ ],
+ "source": [
+ "chat_response2a = chatInstance2.openai_chat_completion(prompt=\"Cookies - London Pound Cake 75 - Gummy - 10ct - 100mg\")\n",
+ "chat_response2b = chatInstance2.openai_chat_completion(prompt=\"Brownies - Berlin Chocolate Hazelnut 69 - Flower - 1ct - 69mg\")\n",
+ "print(chat_response2a['choices'][0]['message']['content'])\n",
+ "print(chat_response2b['choices'][0]['message']['content'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def join_dicts(dict1, dict2):\n",
+ "    return {key:[dict1[key], dict2[key]] for key in dict1}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'brand': ['Cookies', 'Berlin'],\n",
+ " 'product_category': ['Edibles', 'Edibles'],\n",
+ " 'sub_product_category': ['Gummy', 'Brownies'],\n",
+ " 'strain_name': ['LondonPoundCake75', 'ChocolateHazelnut69']}"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "out2a_as_json = json_to_dict(chat_response2a['choices'][0]['message']['content'])\n",
+ "out2b_as_json = json_to_dict(chat_response2b['choices'][0]['message']['content'])\n",
+ "\n",
+ "out3_as_json = join_dicts(out2a_as_json, out2b_as_json)\n",
+ "out3_as_json"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Try via util.py File"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from util import json_to_dict, join_dicts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'brand': ['Cookies', 'Berlin'],\n",
+ " 'product_category': ['Edibles', 'Edibles'],\n",
+ " 'sub_product_category': ['Gummy', 'Brownies'],\n",
+ " 'strain_name': ['LondonPoundCake75', 'ChocolateHazelnut69']}"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "out2a_as_json = json_to_dict(chat_response2a['choices'][0]['message']['content'])\n",
+ "out2b_as_json = json_to_dict(chat_response2b['choices'][0]['message']['content'])\n",
+ "\n",
+ "out3_as_json = join_dicts(out2a_as_json, out2b_as_json)\n",
+ "out3_as_json"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "kd-llm-dc",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.11"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
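
The notebook's end-to-end flow is: get a JSON-formatted reply per product, parse it with json_to_dict, combine two products with join_dicts, and build a DataFrame. Below is that flow condensed into a runnable sketch that skips the OpenAI calls and starts from two replies copied from the notebook's printed outputs; json_to_dict and join_dicts are condensed but equivalent copies of the notebook's definitions.

    import pandas as pd

    # Two JSON-formatted model replies, copied from the notebook's printed outputs (cell 14).
    reply_a = '{"brand": "Cookies", "product_category": "Edibles", "sub_product_category": "Gummy", "strain_name": "London Pound Cake 75"}'
    reply_b = '{"brand": "Berlin", "product_category": "Edibles", "sub_product_category": "Brownies", "strain_name": "Chocolate Hazelnut 69"}'

    def json_to_dict(json_string):
        # Same token-pairing parser defined in the notebook (and moved into util.py below).
        for ch in ('\n', '\t', ' ', '"', '{', '}'):
            json_string = json_string.replace(ch, '')
        tokens = json_string.replace(':', ',').split(',')
        return {tokens[i]: tokens[i + 1] for i in range(0, len(tokens), 2)}

    def join_dicts(dict1, dict2):
        # Pair the two single-product dicts column-wise, as in cell 15.
        return {key: [dict1[key], dict2[key]] for key in dict1}

    joined = join_dicts(json_to_dict(reply_a), json_to_dict(reply_b))
    df = pd.DataFrame(joined)  # one row per product, one column per field
    print(df)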
app/util.py CHANGED
@@ -1,5 +1,17 @@
- # write a function that compares the completion and prediction, separating each string by comma into their respective columns, then compare each column and return a dataframe with the results
def compare_completion_and_prediction(completion, prediction, verbose=False):
+ """
+ a function that compares the completion and prediction
+ separating each string by comma into their respective columns,
+ then compare each column and return a DataFrame with the results
+
+ Args:
+ completion (_type_): str
+ prediction (_type_): str
+ verbose (bool, optional): bool. Defaults to False.
+
+ Returns:
+ _type_: json object with completion, prediction, matches, and num_correct
+ """
# if verbose is True, print the completion and prediction strings
if verbose:
print("Completion:", completion, f"type({type(completion)}):")
@@ -7,14 +19,38 @@ def compare_completion_and_prediction(completion, prediction, verbose=False):
# split completion and prediction strings on comma character
completion = completion.split(',')
prediction = prediction.split(',')
- # create a column that counts the number of matchs between completion and prediction
+ # create a column that counts the number of matches between completion and prediction
matches = [completion[i] == prediction[i] for i in range(len(completion))]
- # create a json dictionary with the completion, prediction, matches, and num_correct fields
- json_dict = {
+ return {
"completion": completion,
"prediction": prediction,
"matches": matches,
- "num_correct": sum(matches)
+ "num_correct": sum(matches),
}
- # return the json dictionary
- return json_dict
+
+ def json_to_dict(json_string):
+ """function that takes string in the form of json and returns a dictionary"""
+ json_string = json_string.replace('\n', '')
+ json_string = json_string.replace('\t', '')
+ json_string = json_string.replace(' ', '')
+ json_string = json_string.replace('"', '')
+ json_string = json_string.replace('{', '')
+ json_string = json_string.replace('}', '')
+ json_string = json_string.replace(':', ',')
+ json_string = json_string.split(',')
+ return {
+ json_string[i]: json_string[i + 1]
+ for i in range(0, len(json_string), 2)
+ }
+
+ def join_dicts(dict1, dict2):
+ """function that joins two dictionaries into one dictionary
+
+ Args:
+ dict1 (_type_): dict
+ dict2 (_type_): dict
+
+ Returns:
+ _type_: dict
+ """
+ return {key:[dict1[key], dict2[key]] for key in dict1}
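
For reference, a quick usage sketch of the refactored compare_completion_and_prediction helper. The two comma-separated strings below are made up for illustration; the function body is a condensed copy of the one above with the verbose printing omitted.

    def compare_completion_and_prediction(completion, prediction):
        # Split both strings on commas and compare the fields position by position.
        completion = completion.split(',')
        prediction = prediction.split(',')
        matches = [completion[i] == prediction[i] for i in range(len(completion))]
        return {
            "completion": completion,
            "prediction": prediction,
            "matches": matches,
            "num_correct": sum(matches),
        }

    # Hypothetical ground truth vs. model output, in the comma-separated layout the function assumes.
    result = compare_completion_and_prediction(
        "Cookies,Edibles,Gummy,London Pound Cake 75",
        "Cookies,Edibles,Brownies,London Pound Cake 75",
    )
    print(result["matches"])      # [True, True, False, True]
    print(result["num_correct"])  # 3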
prompts/gpt4-system-message2.txt CHANGED
@@ -7,6 +7,12 @@ I am going to provide marijuana product information. Using the information I pro

Additional requirements:

- - DO NOT EXPLAIN YOUR SELF
+ DO NOT EXPLAIN YOUR SELF
+ Format output in JSON format
+
+ example output:
+ {"col1": "value1", "col2": "value2", "col3": "value3"}
+
+ ---

Product data below
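
The revised system message asks the model to answer as a single JSON object. Below is a hedged sketch of how a prompt file like this is typically paired with a product string in a chat-completion request; the actual wiring lives in openai_chat_completion.py, which this commit does not touch, so the message layout and the repo-root-relative path are assumptions, not the project's confirmed code.

    import os
    import openai  # pre-1.0 openai client, matching the response['choices'][0]... access used in app.py

    openai.api_key = os.getenv("OPENAI_API_KEY")

    # Assumed wiring: system message from the prompt file, product data as the user turn.
    sys_mes = open('prompts/gpt4-system-message2.txt', 'r').read()
    product = "Cookies - London Pound Cake 75 - Gummy - 10ct - 100mg"

    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": sys_mes},
            {"role": "user", "content": product},
        ],
    )
    print(response['choices'][0]['message']['content'])
    # Expected shape, per the "example output" line added above:
    # {"brand": "Cookies", "product_category": "Edibles", "sub_product_category": "Gummy", "strain_name": "London Pound Cake 75"}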