Spaces:

tomascufaro
/

keyword_classification

Sleeping

App Files Files Community

tomascufaro committed on Feb 19, 2024

Commit

b8ae7f5

1 Parent(s): ad82fc4

no categories support

Browse files

Files changed (1) hide show

app.py +60 -30

app.py CHANGED Viewed

@@ -19,6 +19,7 @@ from langchain.prompts import ChatPromptTemplate
 import gradio as gr
 from collections import defaultdict
 # Schema
 schema = {
     "properties": {
@@ -31,7 +32,7 @@ schema = {
 # Input
 prompt = ChatPromptTemplate.from_messages(
     [
-        ("system", "You are an expert marketing researcher specialized in the finance industry"),
         ("human", """{prompt_input}.
          Here you have the categories splitted by coma: {categories}.
          and Here you have the keywords splitted by coma: {keywords}."""),
@@ -39,10 +40,20 @@ prompt = ChatPromptTemplate.from_messages(
     ]
 )
-llm = ChatOpenAI(temperature=0, openai_api_key=os.environ['OpenAI_APIKEY'], model="gpt-3.5-turbo")
 chain = create_extraction_chain(schema, llm, prompt, verbose=1)
-def run_chain(input_prompt, keywords_file, categories_file, batch_size=50):
     results = []
     batch_size = batch_size
     index = 0
@@ -50,31 +61,51 @@ def run_chain(input_prompt, keywords_file, categories_file, batch_size=50):
         keywords = pd.read_csv(keywords_file.name)
     except:
         keywords = pd.read_excel(keywords_file.name)
-    try:
-        categories = pd.read_csv(categories_file.name)
-    except:
-        categories = pd.read_excel(categories_file.name)
-    keywords = list(keywords[keywords.columns[0]].values)
-    categories = list(categories[categories.columns[0]].values)
-    while index < len(keywords):
         try:
-            batch = keywords[index:index+batch_size]
         except:
-            batch = keywords[index:]
-        try:
-            result = chain.run({'prompt_input':input_prompt, 'categories':','.join(categories), 'keywords':','.join(batch)})
-        except Exception as E:
-            print('this batch did not worked from {} to {}'.format(index, index + batch_size))
-            print(E)
-            result = []
-        results += result
-        index += batch_size
-        results_to_csv(results)
-        #print((index, batch_size, len(keywords)))
-    return results, 'themes_results.csv'
 def results_to_csv(results):
-    super_dict = defaultdict(list)
     for d in results:
         for k, v in d.items():  # d.items() in Python 3+
             super_dict[k].append(v)
@@ -85,20 +116,19 @@ with gr.Blocks() as demo:
     prompt_input = gr.Text("""I need your help to analyze and categorize the provided list of keywords
 into the appropriate categories.
 The goal is to understand information demand on search engines within this industry. Each keyword represents a search and it should have a relation with the category.
-Extract each keyword and assign the best category among the given categories. Return every keyword with the relative category in pairs.""")
     gr.Markdown("Upload CSV or xlsx with keywords: Just a csv  with all the keywords in one column. Should have a header")
     keywords_file = gr.File(file_types=['csv', 'xlsx'], label='keywords')
     gr.Markdown("Upload CSV or xlsx with categories: Just a csv with all the keywords in one column. Should have a header")
     categories_file = gr.File(file_types=['.csv', '.xlsx'], label='categories')
-    with gr.Accordion("Open for More!"):
-        gr.Markdown("Look at me...")
-    btn = gr.Button(value="run")
     txt_3 = gr.Textbox(value="", label="Output")
     output_file = gr.File(label="Output File",
                 file_count="single",
                 file_types=["", ".", ".csv",".xls",".xlsx"])
     btn.click(run_chain, inputs=[prompt_input, keywords_file, categories_file], outputs=[txt_3, output_file])
 demo.launch()

 import gradio as gr
 from collections import defaultdict
+"""Core Modules"""
 # Schema
 schema = {
     "properties": {
 # Input
 prompt = ChatPromptTemplate.from_messages(
     [
+        ("system", "You are an expert marketing researcher"),
         ("human", """{prompt_input}.
          Here you have the categories splitted by coma: {categories}.
          and Here you have the keywords splitted by coma: {keywords}."""),
     ]
 )
+prompt_no_cat = ChatPromptTemplate.from_messages(
+    [
+        ("system", "You are an expert marketing researcher"),
+        ("human", """{prompt_input}.
+         and Here you have the keywords splitted by coma: {keywords}."""),
+        ("human", "Tip: Make sure to answer in the correct format and DO NOT leave keywords without category and DO NOT skip keywords. Please categorize all the keywords that I give you, each keyword must have just one and only one category."),
+    ]
+)
+llm = ChatOpenAI(temperature=0, openai_api_key=APIkeys.OpenAI_APIKEY, model="gpt-3.5-turbo")
 chain = create_extraction_chain(schema, llm, prompt, verbose=1)
+chain_no_cat = create_extraction_chain(schema, llm, prompt_no_cat, verbose=1)
+def run_chain(input_prompt, keywords_file, categories_file=None, batch_size=50):
     results = []
     batch_size = batch_size
     index = 0
         keywords = pd.read_csv(keywords_file.name)
     except:
         keywords = pd.read_excel(keywords_file.name)
+    if categories_file != None:
         try:
+            categories = pd.read_csv(categories_file.name)
         except:
+            categories = pd.read_excel(categories_file.name)
+        categories = list(categories[categories.columns[0]].values)
+        keywords = list(keywords[keywords.columns[0]].values)
+        while index < len(keywords):
+            try:
+                batch = keywords[index:index+batch_size]
+            except:
+                batch = keywords[index:]
+            try:
+                result = chain.run({'prompt_input':input_prompt, 'categories':','.join(categories), 'keywords':','.join(batch)})
+            except Exception as E:
+                print('this batch did not worked from {} to {}'.format(index, index + batch_size))
+                print(E)
+                result = []
+            results += result
+            index += batch_size
+            results_to_csv(results)
+            #print((index, batch_size, len(keywords)))
+        return results, 'themes_results.csv'
+    else:
+        keywords = list(keywords[keywords.columns[0]].values)
+        batch_size = len(keywords)
+        while index < len(keywords):
+            try:
+                batch = keywords[index:index+batch_size]
+            except:
+                batch = keywords[index:]
+            try:
+                result = chain_no_cat.run({'prompt_input':input_prompt, 'keywords':','.join(batch)})
+            except Exception as E:
+                print('this batch did not worked from {} to {}'.format(index, index + batch_size))
+                print(E)
+                result = []
+            results += result
+            index += batch_size
+            results_to_csv(results)
+            #print((index, batch_size, len(keywords)))
+        return results, 'themes_results.csv'
 def results_to_csv(results):
+    super_dict = collections.defaultdict(list)
     for d in results:
         for k, v in d.items():  # d.items() in Python 3+
             super_dict[k].append(v)
     prompt_input = gr.Text("""I need your help to analyze and categorize the provided list of keywords
 into the appropriate categories.
 The goal is to understand information demand on search engines within this industry. Each keyword represents a search and it should have a relation with the category.
+Extract each keyword and assign the best category among the given categories. Return every keyword with the relative category in pairs.
+If the categories are not given """)
     gr.Markdown("Upload CSV or xlsx with keywords: Just a csv  with all the keywords in one column. Should have a header")
     keywords_file = gr.File(file_types=['csv', 'xlsx'], label='keywords')
     gr.Markdown("Upload CSV or xlsx with categories: Just a csv with all the keywords in one column. Should have a header")
     categories_file = gr.File(file_types=['.csv', '.xlsx'], label='categories')
+    btn = gr.Button(value="Run with categories")
+    btn2 = gr.Button(value="Run without categories")
     txt_3 = gr.Textbox(value="", label="Output")
     output_file = gr.File(label="Output File",
                 file_count="single",
                 file_types=["", ".", ".csv",".xls",".xlsx"])
     btn.click(run_chain, inputs=[prompt_input, keywords_file, categories_file], outputs=[txt_3, output_file])
+    btn2.click(run_chain, inputs=[prompt_input, keywords_file], outputs=[txt_3, output_file])
 demo.launch()