Spaces:

matthewfarant
/

fertilizer-catalog-engine

Runtime error

App Files Files Community

matthewfarant committed on Nov 6, 2023

Commit

d6df412

•

1 Parent(s): 95ecb1f

Add types

Browse files

Files changed (1) hide show

app.py +69 -39

app.py CHANGED Viewed

@@ -11,6 +11,9 @@ from simpletransformers.classification import ClassificationModel
 from transformers import pipeline
 import gradio as gr
 def is_nan(text):
     try:
         value = float(text)
@@ -43,73 +46,100 @@ def prepare_catalog():
     return combined_catalog
 # Your existing decision function
-def decision(user_input, catalog, product_name_catalog):
     # Initialize the model
-    pipe = pipeline("text-classification", model="matthewfarant/indobert-fertilizer-classifier", token = os.getenv('HF_MY_TOKEN'))
     # Extract formula
     user_input_formula = re.findall(r'\d{1,2}\s*[- ]\s*\d{1,2}\s*[- ]\s*\d{1,2}', user_input)
     user_input_formula = np.nan if len(user_input_formula) == 0 else user_input_formula[0]
-    # Similar Product
-    catalog['Similarity Score'] = catalog[product_name_catalog].apply(lambda x: fuzz.token_set_ratio(user_input, x, processor=utils.default_process))
-    catalog['Formula Similarity'] = catalog['Formula'].apply(lambda x: fuzz.token_set_ratio(user_input_formula, x, processor=utils.default_process))
-    # Take Top Similar Product. Take "Product Catalog" first
-    catalog = catalog.sort_values(by=['Similarity Score', 'Formula Similarity', 'Source'], ascending=[False, False, True]).head(1)
-    # Condition
-    if catalog['Similarity Score'].values[0] > 80 and catalog['Source'].values[0] == 'Product Catalog' and (catalog['Formula Similarity'].values[0] == 100 or is_nan(catalog['Formula'].values[0])):
-        return f"Product is Available in Catalog (SKU Registered as {catalog['Registered Product'].values[0]})"
-    elif catalog['Similarity Score'].values[0] > 80 and catalog['Source'].values[0] == 'Registered Fertilizers' and (catalog['Formula Similarity'].values[0] == 100 or is_nan(catalog['Formula'].values[0])):
-        return f"Add as New Product (Registered in Kementan as {catalog['Registered Product'].values[0]})"
-    elif catalog['Similarity Score'].values[0] > 80 and catalog['Formula Similarity'].values[0] < 100:
-        return f"Add as New Product (Similar to {catalog['Registered Product'].values[0]} in {catalog['Source'].values[0]} but with different formula)"
-    elif catalog['Similarity Score'].values[0] < 80:
-        if pipe(user_input)[0]['label'] == 'Fertilizer' and pipe(user_input)[0]['score'] > 0.8:
-            return f"Add as New Product ({pipe(user_input)[0]['score'] * 100}% probability of being a fertilizer)"
         else:
-            return f"Product might not be a Fertilizer ({np.round(pipe(user_input)[0]['score'] * 100,2)}% probability of being a {pipe(user_input)[0]['label']})"
-    else:
-        return "Product is not a Fertilizer"
-def app(input):
     catalog = prepare_catalog()
-    return decision(input, catalog, "Registered Product")
 # Initialize the app
 demo = gr.Interface(
     fn=app,
-    inputs="text",
     outputs="text",
-    examples= ['Petro Nitrat 16-16-16', 'Petro Nitrat 15-15-15', 'Gramoxone 1 Liter', 'Indomie Goreng Aceh'],
     title = 'Fertilizer Catalog Engine 🌽',
     description = 'Catalog Search Engine and Decision Support System for Fertilizer Company',
     article= """
     ### About The App
     This app is built as a part of the Data Science Weekend (DSW) 2023 Challenge submission. This app aims to help fertilizer companies to map
     free-text POS data of multiple types of products into their own fertilizer catalog. By using this app, the company will be able to
     decide whether a product is already available in their catalog, or whether it is a new product that needs (and eligible) to be added to
     the catalog. <br>
     ### How Does it Work?
     This app uses a combination of fuzzy matching and machine learning to determine whether a product is already available in the catalog or not.
     When a product is not available in the catalog, we will use an IndoBERT model to determine if the product is a fertilizer and eligible to be
     added to the catalog. Beforehand, we have fine-tuned the IndoBERT model using a combination of internal and external (web scraping) data, so the
     model will be able to learn how fertilizer products (especially the local ones) look like. <br>
     ### What are the Flags For?
-    The flag is a part of the "Active Transfer Learning" feature of this app. When a user flags an output as "Correct" or "Incorrect", the developer
-    will be able to fine-tune the model using the user's input, hence improving the model's performance. So, please help us to improve the model by
-    flagging the prediction result 🙏 <br>
     ### I want to test multiple inputs at once!
     You can also use our app via API by clicking the "Use via API" below. The API will give developer more flexibility to test multiple inputs
     programmatically. <br>
@@ -124,4 +154,4 @@ demo = gr.Interface(
 # Run the app
 if __name__ == "__main__":
-    demo.launch(show_api=True)

 from transformers import pipeline
 import gradio as gr
+# set current directory
+os.chdir(os.path.dirname(os.path.abspath(__file__)))
 def is_nan(text):
     try:
         value = float(text)
     return combined_catalog
 # Your existing decision function
def decision(user_input, type, catalog, product_name_catalog):
    """Decide how a free-text product description relates to the catalog.

    Args:
        user_input: Free-text product description.
        type: Search mode. One of ``'Fuzzy Search'``, ``'Training Mode'`` or
            ``'Probabilistic Search'``. (The name shadows the builtin; it is
            kept because it is part of the function's public signature.)
        catalog: DataFrame holding at least the ``'Formula'``, ``'Source'``
            and ``'Registered Product'`` columns plus the column named by
            ``product_name_catalog``. It is not modified; the function works
            on a copy.
        product_name_catalog: Name of the column whose values are fuzzy
            matched against ``user_input``.

    Returns:
        A human-readable decision string for ``'Fuzzy Search'`` (prefixed
        with ``[1]``..``[6]``) and ``'Probabilistic Search'``, the best
        matching ``'Registered Product'`` value for ``'Training Mode'``, and
        ``None`` when ``type`` is none of the three known modes.
    """
    # SECURITY: the access token is read from the environment instead of
    # being embedded in the source. Any token that was ever committed to the
    # repository history must be treated as leaked and revoked.
    hf_token = os.getenv('HF_MY_TOKEN')

    # Extract formula (e.g. "16-16-16"); NaN marks "no formula in the input".
    formula_matches = re.findall(r'\d{1,2}\s*[- ]\s*\d{1,2}\s*[- ]\s*\d{1,2}', user_input)
    user_input_formula = formula_matches[0] if formula_matches else np.nan

    def name_similarity(candidate):
        # Fuzzy score between the raw user input and one catalog name.
        return fuzz.token_set_ratio(user_input, candidate, processor=utils.default_process)

    def formula_similarity(candidate):
        # Fuzzy score between the extracted formula and one catalog formula.
        return fuzz.token_set_ratio(user_input_formula, candidate, processor=utils.default_process)

    if type == 'Fuzzy Search':
        # Work on a copy so the caller's DataFrame does not grow extra columns.
        catalog = catalog.copy()
        catalog['Similarity Score'] = catalog[product_name_catalog].apply(name_similarity)
        catalog['Formula Similarity'] = catalog['Formula'].apply(formula_similarity)
        # Take the top similar product; on ties "Product Catalog" sorts
        # before "Registered Fertilizers" because Source is ascending.
        top = catalog.sort_values(
            by=['Similarity Score', 'Formula Similarity', 'Source'],
            ascending=[False, False, True],
        ).head(1)
        score = top['Similarity Score'].values[0]
        formula_score = top['Formula Similarity'].values[0]
        source = top['Source'].values[0]
        registered = top['Registered Product'].values[0]
        catalog_formula = top['Formula'].values[0]

        if score >= 80 and source == 'Product Catalog' and (formula_score == 100 or is_nan(catalog_formula)):
            return f"[1] Product is Available in Catalog (SKU Registered as *{registered}*)"
        elif score >= 80 and source == 'Registered Fertilizers' and (formula_score == 100 or is_nan(catalog_formula)):
            return f"[2] Add as New Product (Registered in Kementan as *{registered}*)"
        elif score >= 80 and formula_score < 100:
            return f"[3] Add as New Product (Similar to *{registered}* in {source} but with different formula)"
        elif score < 80:
            # The classifier is only needed on this path, so it is loaded
            # here and run once; its single prediction is reused below.
            pipe_detect = pipeline(
                "text-classification",
                model="matthewfarant/indobert-fertilizer-classifier",
                token=hf_token,
            )
            prediction = pipe_detect(user_input)[0]
            percent = np.round(prediction['score'] * 100, 2)
            if prediction['label'] == 'Fertilizer' and prediction['score'] > 0.8:
                return f"[4] Add as New Product ({percent}% probability of being a fertilizer)"
            else:
                return f"[5] Product might not be a Fertilizer ({percent}% probability of being a {prediction['label']})"
        else:
            return "[6] Product is not a Fertilizer"
    elif type == 'Training Mode':
        # Same matching as above, but restricted to the company's own
        # catalog. .copy() avoids assigning columns onto a filtered slice.
        catalog = catalog[catalog['Source'] == 'Product Catalog'].copy()
        catalog['Similarity Score'] = catalog[product_name_catalog].apply(name_similarity)
        catalog['Formula Similarity'] = catalog['Formula'].apply(formula_similarity)
        # Take the top similar product.
        top = catalog.sort_values(by=['Similarity Score', 'Formula Similarity'], ascending=False).head(1)
        return top['Registered Product'].values[0]
    elif type == 'Probabilistic Search':
        # The matching model is only needed in this mode.
        pipe_match = pipeline(
            "text-classification",
            model="matthewfarant/indobert-fertilizer-matching",
            token=hf_token,
        )
        catalog = catalog[catalog['Source'] == 'Product Catalog'].copy()
        # The model scores "<input> dan <catalog name>" pairs.
        catalog['Concat Input'] = user_input + ' dan ' + catalog['Registered Product'].astype(str)
        # NOTE(review): this uses the score of the top predicted label
        # without checking which label it is -- confirm that the top label
        # always means "match" for this model.
        catalog['Similarity Score'] = catalog['Concat Input'].apply(lambda x: pipe_match(x)[0]['score'])
        top = catalog.sort_values(by=['Similarity Score'], ascending=False).head(1)
        return f"{np.round(top['Similarity Score'].values[0] * 100,2)}% probability of being a {top['Registered Product'].values[0]}"
    # Unknown search type: fall through and return None, as before.
    return None
def app(input, type):
    """Gradio entry point: validate the form values and run the decision.

    Args:
        input: Text entered by the user (product description).
        type: Selected search mode, forwarded unchanged to ``decision``.

    Returns:
        A prompt asking the user to complete the form when either value is
        missing or the text is blank; otherwise whatever ``decision``
        returns for a freshly prepared catalog.
    """
    # An untouched textbox yields an empty string rather than None, so blank
    # and whitespace-only text must be rejected too; otherwise the catalog is
    # built and the models are run on an empty query.
    if input is None or type is None or not str(input).strip():
        return "Please fill in the input and select the search type"
    catalog = prepare_catalog()
    return decision(input, type, catalog, "Registered Product")
 # Initialize the app
 demo = gr.Interface(
     fn=app,
+    inputs=[
+        gr.Textbox(),
+        gr.Radio(["Fuzzy Search", "Probabilistic Search", "Training Mode"], type="value")
+        ],
     outputs="text",
+    examples= [
+        ['Petro Nitrat 16-16-16','Fuzzy Search'],
+        ['Petro Nitrat 15-15-15','Fuzzy Search'],
+        ['Gramoxone 1 Liter','Fuzzy Search'],
+        ['Indomie Goreng Aceh','Fuzzy Search']
+        ],
     title = 'Fertilizer Catalog Engine 🌽',
     description = 'Catalog Search Engine and Decision Support System for Fertilizer Company',
     article= """
     ### About The App
     This app is built as a part of the Data Science Weekend (DSW) 2023 Challenge submission. This app aims to help fertilizer companies to map
     free-text POS data of multiple types of products into their own fertilizer catalog. By using this app, the company will be able to
     decide whether a product is already available in their catalog, or whether it is a new product that needs (and eligible) to be added to
     the catalog. <br>
     ### How Does it Work?
     This app uses a combination of fuzzy matching and machine learning to determine whether a product is already available in the catalog or not.
     When a product is not available in the catalog, we will use an IndoBERT model to determine if the product is a fertilizer and eligible to be
     added to the catalog. Beforehand, we have fine-tuned the IndoBERT model using a combination of internal and external (web scraping) data, so the
     model will be able to learn how fertilizer products (especially the local ones) look like. <br>
     ### What are the Flags For?
+    The flag is a part of the "Active Transfer Learning" feature of this app when the user selects "Training Mode". When a user flags an output as "Correct" or "Incorrect",
+    the developer will be able to fine-tune the model using the user's input, hence improving the model's performance when the user selects "Probabilistic Search". So, please
+    help us to improve the model by flagging the prediction result 🙏 <br>
     ### I want to test multiple inputs at once!
     You can also use our app via API by clicking the "Use via API" below. The API will give developer more flexibility to test multiple inputs
     programmatically. <br>
 # Run the app
 if __name__ == "__main__":
+    demo.launch(show_api=True)