# NOTE: scraped Hugging Face Spaces page banner, not part of the program: "Spaces: Runtime error"
import functions.extract_function as get | |
import functions.preprocessing_function as preprocess | |
import functions.modelling_function as modelling | |
import os | |
import re | |
import math | |
import numpy as np | |
from rapidfuzz import process, fuzz, utils | |
from simpletransformers.classification import ClassificationModel | |
from transformers import pipeline | |
import gradio as gr | |
# Switch the working directory to the folder that contains this script
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
os.chdir(_SCRIPT_DIR)
def is_nan(text):
    """Report whether ``text`` converts to a floating-point NaN.

    The value is passed through ``float()``, so NaN floats as well as strings
    such as ``"nan"`` count as NaN.

    Args:
        text: Any value; in this file it is a catalog formula cell, i.e. a
            formula string or ``np.nan``.

    Returns:
        True if ``float(text)`` is NaN. False if it is any other number or
        cannot be converted to a float at all.
    """
    try:
        value = float(text)
    except (TypeError, ValueError, OverflowError):
        # ValueError: non-numeric strings such as "16-16-16".
        # TypeError: None and other objects float() does not accept.
        # OverflowError: integers too large for a float; those are not NaN.
        return False
    return math.isnan(value)
# Build the combined catalog that user input is matched against
def prepare_catalog():
    """Assemble the internal product catalog plus registered fertilizers.

    Loads both data sets through the project's ``get`` helpers, cleans and
    de-duplicates them through the ``preprocess`` helpers, and adds a
    ``Formula`` column extracted from the ``Registered Product`` text.

    Returns:
        The combined catalog as returned by ``preprocess.combine_catalog``
        (duplicates dropped), with an extra ``Formula`` column holding the
        first formula-like token of each product name, or NaN when the name
        contains none.
    """
    formula_regex = r'\d{1,2}\s*[- ]\s*\d{1,2}\s*[- ]\s*\d{1,2}'

    def first_formula(name):
        # First "NN-NN-NN" / "NN NN NN" style token in the name, NaN if absent
        hits = re.findall(formula_regex, name)
        return hits[0] if len(hits) > 0 else np.nan

    # Raw inputs: internal catalog first, then the external registry
    internal = get.internal_data('catalog')
    registered = get.registered_fertilizer_data()

    # Clean the SKU column, then build the "SKU + brand + type" search strings
    internal = preprocess.clean_dataframe(
        internal,
        'Product SKU',
        remove_na=False,
        remove_non_words=True,
        remove_symbols=True,
    )
    brand = internal['Brand'].str.lower()
    kind = internal['Type'].str.lower()
    internal['Product SKU Full Clean'] = internal['Product SKU Clean'] + ' ' + brand + ' ' + kind
    internal['Product SKU Full'] = internal['Product SKU'] + ' ' + brand + ' ' + kind

    # Compare internal products against the registry to find overlaps
    internal = preprocess.fuzzy_join_compare(
        internal,
        'Product SKU Clean',
        'Product SKU Full Clean',
        registered,
        take_regist_number=True,
        set_ratio_weight=1,
        ratio_weight=0,
    )

    # Keep only registered fertilizers that are not already in the internal
    # catalog (internal rows scoring above 80 are treated as already present)
    already_listed = internal['Max Similarity Score'] > 80
    registered = preprocess.slice_with_filter(
        registered,
        'Nomor Pendaftaran',
        internal,
        use_filter=True,
        filter_condition=already_listed,
    )

    # Stack both sources into one catalog and drop exact duplicate rows
    combined = preprocess.combine_catalog(
        internal['Product SKU Full'],
        registered['Nama Lengkap'],
        'Product Catalog',
        'Registered Fertilizers',
    ).drop_duplicates()

    combined['Formula'] = combined['Registered Product'].apply(first_formula)
    return combined
# Decision logic: map one free-text product name onto the catalog

# Three 1-2 digit numbers separated by a dash or space, e.g. "16-16-16"
_FORMULA_PATTERN = re.compile(r'\d{1,2}\s*[- ]\s*\d{1,2}\s*[- ]\s*\d{1,2}')

_DETECT_MODEL = "matthewfarant/indobert-fertilizer-classifier"
_MATCH_MODEL = "matthewfarant/indobert-fertilizer-matching"

# Loaded pipelines keyed by model name, so each model is built at most once
# per process instead of on every request
_PIPELINE_CACHE = {}


def _get_pipeline(model_name):
    """Return the text-classification pipeline for ``model_name``, loading it on first use."""
    if model_name not in _PIPELINE_CACHE:
        _PIPELINE_CACHE[model_name] = pipeline(
            "text-classification",
            model=model_name,
            token=os.getenv('HF_MY_TOKEN'),
        )
    return _PIPELINE_CACHE[model_name]


def _add_fuzzy_scores(catalog, product_name_catalog, user_input, user_input_formula):
    """Return a copy of ``catalog`` with name and formula similarity columns added."""
    # Work on a copy so neither the caller's frame nor a filtered slice is mutated
    scored = catalog.copy()
    scored['Similarity Score'] = scored[product_name_catalog].apply(
        lambda name: fuzz.token_set_ratio(user_input, name, processor=utils.default_process)
    )
    scored['Formula Similarity'] = scored['Formula'].apply(
        lambda formula: fuzz.token_set_ratio(user_input_formula, formula, processor=utils.default_process)
    )
    return scored


def decision(user_input, type, catalog, product_name_catalog):
    """Decide how ``user_input`` relates to the catalog, for the given search mode.

    Args:
        user_input: Free-text product name typed by the user.
        type: Search mode: ``'Fuzzy Search'``, ``'Training Mode'`` or
            ``'Probabilistic Search'``. (The name shadows the builtin but is
            kept so existing callers keep working.)
        catalog: DataFrame with at least the ``Registered Product``,
            ``Source`` and ``Formula`` columns. It is not modified.
        product_name_catalog: Name of the catalog column that is fuzzy-matched
            against ``user_input``.

    Returns:
        * Fuzzy Search: one of the ``[1]``..``[6]`` decision messages.
        * Training Mode: the best-matching ``Registered Product`` value among
          rows whose ``Source`` is ``'Product Catalog'``.
        * Probabilistic Search: a message with the matching model's top score
          and the corresponding product.
        * Any other mode: ``None``.

    Raises:
        IndexError: If the (filtered) catalog has no rows to rank.
    """
    # Formula found in the user's text, or NaN when there is none
    formula_match = _FORMULA_PATTERN.search(user_input)
    user_input_formula = formula_match.group(0) if formula_match else np.nan

    if type == 'Fuzzy Search':
        scored = _add_fuzzy_scores(catalog, product_name_catalog, user_input, user_input_formula)
        # Best name match first, then best formula match; the ascending sort
        # on Source puts 'Product Catalog' ahead of 'Registered Fertilizers'
        best = scored.sort_values(
            by=['Similarity Score', 'Formula Similarity', 'Source'],
            ascending=[False, False, True],
        ).head(1)
        similarity = best['Similarity Score'].values[0]
        formula_similarity = best['Formula Similarity'].values[0]
        source = best['Source'].values[0]
        product = best['Registered Product'].values[0]

        if similarity >= 80:
            # The formula is acceptable when it matches exactly or when the
            # catalog entry has no formula to compare against
            formula_ok = formula_similarity == 100 or is_nan(best['Formula'].values[0])
            if formula_ok and source == 'Product Catalog':
                return f"[1] Product is Available in Catalog (SKU Registered as *{product}*)"
            if formula_ok and source == 'Registered Fertilizers':
                return f"[2] Add as New Product (Registered in Kementan as *{product}*)"
            if formula_similarity < 100:
                return f"[3] Add as New Product (Similar to *{product}* in {source} but with different formula)"
        elif similarity < 80:
            # Nothing similar in the catalog: ask the classifier (once) whether
            # the text looks like a fertilizer at all
            prediction = _get_pipeline(_DETECT_MODEL)(user_input)[0]
            label = prediction['label']
            percent = np.round(prediction['score'] * 100, 2)
            if label == 'Fertilizer' and prediction['score'] > 0.8:
                return f"[4] Add as New Product ({percent}% probability of being a fertilizer)"
            return f"[5] Product might not be a Fertilizer ({percent}% probability of being a {label})"
        # Reached only when a close match comes from an unrecognised Source
        return "[6] Product is not a Fertilizer"

    if type == 'Training Mode':
        # Same fuzzy ranking as above, restricted to the internal catalog
        internal_only = catalog[catalog['Source'] == 'Product Catalog']
        scored = _add_fuzzy_scores(internal_only, product_name_catalog, user_input, user_input_formula)
        best = scored.sort_values(by=['Similarity Score', 'Formula Similarity'], ascending=False).head(1)
        return best['Registered Product'].values[0]

    if type == 'Probabilistic Search':
        candidates = catalog[catalog['Source'] == 'Product Catalog'].copy()
        pipe_match = _get_pipeline(_MATCH_MODEL)
        # The matching model scores "<input> dan <catalog product>" pairs
        candidates['Concat Input'] = user_input + ' dan ' + candidates['Registered Product'].astype(str)
        candidates['Similarity Score'] = candidates['Concat Input'].apply(lambda pair: pipe_match(pair)[0]['score'])
        best = candidates.sort_values(by=['Similarity Score'], ascending=False).head(1)
        return f"{np.round(best['Similarity Score'].values[0] * 100,2)}% probability of being a {best['Registered Product'].values[0]}"

    # Unknown search mode: no decision
    return None
def app(input, type):
    """Gradio callback: validate the form values and run the catalog search.

    Args:
        input: Product text from the textbox.
        type: Selected search mode from the radio group.
            (Both names shadow builtins but are kept so the interface is
            unchanged.)

    Returns:
        A prompt asking for both fields when either is missing or blank,
        otherwise the result of ``decision`` against a freshly prepared
        catalog, matched on the ``"Registered Product"`` column.
    """
    # An empty textbox is typically submitted as "" rather than None, so blank
    # and whitespace-only text must be rejected as well
    if input is None or not type or not str(input).strip():
        return "Please fill in the input and select the search type"
    # NOTE(review): the catalog is rebuilt on every request; consider caching
    # it if the underlying data does not change between calls
    catalog = prepare_catalog()
    return decision(input, type, catalog, "Registered Product")
# Gradio interface definition

# Search modes offered in the radio group
_SEARCH_MODES = ["Fuzzy Search", "Probabilistic Search", "Training Mode"]

# Clickable example rows: [product text, search mode]
_EXAMPLE_QUERIES = [
    ['Petro Nitrat 16-16-16', 'Fuzzy Search'],
    ['Petro Nitrat 15-15-15', 'Fuzzy Search'],
    ['Gramoxone 1 Liter', 'Fuzzy Search'],
    ['Indomie Goreng Aceh', 'Fuzzy Search'],
]

# Markdown shown below the interface
_ABOUT_ARTICLE = """
### About The App
This app is built as a part of the Data Science Weekend (DSW) 2023 Challenge submission. This app aims to help fertilizer companies to map
free-text POS data of multiple types of products into their own fertilizer catalog. By using this app, the company will be able to
decide whether a product is already available in their catalog, or whether it is a new product that needs (and eligible) to be added to
the catalog. <br>
### How Does it Work?
This app uses a combination of fuzzy matching and machine learning to determine whether a product is already available in the catalog or not.
When a product is not available in the catalog, we will use an IndoBERT model to determine if the product is a fertilizer and eligible to be
added to the catalog. Beforehand, we have fine-tuned the IndoBERT model using a combination of internal and external (web scraping) data, so the
model will be able to learn how fertilizer products (especially the local ones) look like. <br>
### What are the Flags For?
The flag is a part of the "Active Transfer Learning" feature of this app when the user selects "Training Mode". When a user flags an output as "Correct" or "Incorrect",
the developer will be able to fine-tune the model using the user's input, hence improving the model's performance when the user selects "Probabilistic Search". So, please
help us to improve the model by flagging the prediction result 🙏 <br>
### I want to test multiple inputs at once!
You can also use our app via API by clicking the "Use via API" below. The API will give developer more flexibility to test multiple inputs
programmatically. <br>
"""

# Text in, text out; results can be flagged manually as Correct / Incorrect
demo = gr.Interface(
    fn=app,
    inputs=[gr.Textbox(), gr.Radio(_SEARCH_MODES, type="value")],
    outputs="text",
    examples=_EXAMPLE_QUERIES,
    title='Fertilizer Catalog Engine 🌽',
    description='Catalog Search Engine and Decision Support System for Fertilizer Company',
    article=_ABOUT_ARTICLE,
    api_name='search',
    allow_flagging='manual',
    flagging_options=["Correct", "Incorrect"],
    flagging_dir='flagging/',
    theme=gr.themes.Soft(),
)
# Script entry point
def main():
    """Launch the Gradio interface with the API page enabled."""
    demo.launch(show_api=True)


if __name__ == "__main__":
    main()