# NOTE: page-header residue from the hosting page, not part of the program:
#   "Spaces: Runtime error / Runtime error"
import functions.extract_function as get | |
import functions.preprocessing_function as preprocess | |
import functions.modelling_function as modelling | |
import os | |
import re | |
import math | |
import numpy as np | |
from rapidfuzz import process, fuzz, utils | |
from simpletransformers.classification import ClassificationModel | |
from transformers import pipeline | |
import gradio as gr | |
def is_nan(text):
    """Return True when *text* converts to a float NaN, else False.

    Args:
        text: Any value. Strings such as ``"nan"`` and float NaN values
            count as NaN; everything that cannot be converted to a float
            (ordinary text, ``None``, containers, oversized integers) does not.

    Returns:
        bool: ``True`` only if ``float(text)`` succeeds and is NaN.
    """
    try:
        value = float(text)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / list / dict; ValueError: non-numeric string;
        # OverflowError: int too large for a float. None of these are NaN.
        return False
    return math.isnan(value)
def prepare_catalog():
    """Build the combined catalog that product names are matched against.

    The internal catalog and the registered-fertilizer list are loaded via
    the project ``get`` helpers, cleaned / compared / merged via the project
    ``preprocess`` helpers, and finally tagged with a ``'Formula'`` column.

    Returns:
        The de-duplicated result of ``preprocess.combine_catalog`` with an
        extra ``'Formula'`` column: the first number triplet (e.g.
        ``16-16-16``) found in ``'Registered Product'``, or NaN if none.
    """
    # Raw inputs: internal catalog first, then the external registry.
    internal = get.internal_data('catalog')
    registry = get.registered_fertilizer_data()

    # Clean the SKU names, then build the "full" name variants by appending
    # the lowercased brand and type.
    internal = preprocess.clean_dataframe(
        internal,
        'Product SKU',
        remove_na=False,
        remove_non_words=True,
        remove_symbols=True,
    )
    brand_lower = internal['Brand'].str.lower()
    type_lower = internal['Type'].str.lower()
    internal['Product SKU Full Clean'] = internal['Product SKU Clean'] + ' ' + brand_lower + ' ' + type_lower
    internal['Product SKU Full'] = internal['Product SKU'] + ' ' + brand_lower + ' ' + type_lower

    # Compare the internal catalog against the registry (set-ratio only).
    internal = preprocess.fuzzy_join_compare(
        internal,
        'Product SKU Clean',
        'Product SKU Full Clean',
        registry,
        take_regist_number=True,
        set_ratio_weight=1,
        ratio_weight=0,
    )

    # 1. Keep only registered fertilizers that are NOT already in the
    #    product catalog (rows scoring above 80 are treated as matches).
    high_similarity = internal['Max Similarity Score'] > 80
    registry = preprocess.slice_with_filter(
        registry,
        'Nomor Pendaftaran',
        internal,
        use_filter=True,
        filter_condition=high_similarity,
    )

    # 2. Stack both sources into one catalog, 3. drop exact duplicates.
    merged = preprocess.combine_catalog(
        internal['Product SKU Full'],
        registry['Nama Lengkap'],
        'Product Catalog',
        'Registered Fertilizers',
    )
    merged = merged.drop_duplicates()

    def first_formula(product_name):
        # First formula-like triplet in the name, or NaN when there is none.
        found = re.findall(r'\d{1,2}\s*[- ]\s*\d{1,2}\s*[- ]\s*\d{1,2}', product_name)
        return found[0] if found else np.nan

    merged['Formula'] = merged['Registered Product'].apply(first_formula)
    return merged
def decision(user_input, catalog, product_name_catalog):
    """Decide how a free-text product name relates to the catalog.

    The input is fuzzy-matched against ``catalog[product_name_catalog]`` and
    its formula (first number triplet such as ``16-16-16``) against
    ``catalog['Formula']``. Only when the best name match scores 80 or less
    is the text classifier consulted.

    Args:
        user_input: Free-text product name.
        catalog: DataFrame with at least the columns named by
            ``product_name_catalog``, ``'Formula'``, ``'Source'`` and
            ``'Registered Product'``. It is not modified.
        product_name_catalog: Name of the column holding product names.

    Returns:
        str: A human-readable verdict.
    """
    # Extract the first formula-like triplet from the input, NaN when absent.
    found = re.findall(r'\d{1,2}\s*[- ]\s*\d{1,2}\s*[- ]\s*\d{1,2}', user_input)
    user_input_formula = found[0] if found else np.nan

    # Score on a copy so the caller's DataFrame does not silently gain columns.
    scored = catalog.copy()
    scored['Similarity Score'] = scored[product_name_catalog].apply(
        lambda name: fuzz.token_set_ratio(user_input, name, processor=utils.default_process)
    )
    scored['Formula Similarity'] = scored['Formula'].apply(
        lambda formula: fuzz.token_set_ratio(user_input_formula, formula, processor=utils.default_process)
    )

    # Best match first; on ties the ascending 'Source' sort puts
    # "Product Catalog" ahead of "Registered Fertilizers".
    best = scored.sort_values(
        by=['Similarity Score', 'Formula Similarity', 'Source'],
        ascending=[False, False, True],
    ).iloc[0]
    name_score = best['Similarity Score']
    formula_score = best['Formula Similarity']
    source = best['Source']
    registered_name = best['Registered Product']

    if name_score > 80:
        # The formula is acceptable when it matches exactly or when the
        # catalog entry has no formula to compare against.
        formula_ok = formula_score == 100 or is_nan(best['Formula'])
        if formula_ok and source == 'Product Catalog':
            return f"Product is Available in Catalog (SKU Registered as {registered_name})"
        if formula_ok and source == 'Registered Fertilizers':
            return f"Add as New Product (Registered in Kementan as {registered_name})"
        if formula_score < 100:
            return f"Add as New Product (Similar to {registered_name} in {source} but with different formula)"
        # Exact formula match from an unrecognised source: unchanged fallback.
        return "Product is not a Fertilizer"

    # No close catalog match (score <= 80, so a score of exactly 80 is also
    # classified instead of being rejected outright). The model is loaded
    # only on this path and the input is classified exactly once.
    pipe = pipeline(
        "text-classification",
        model="matthewfarant/indobert-fertilizer-classifier",
        token=os.getenv('HF_MY_TOKEN'),
    )
    prediction = pipe(user_input)[0]
    label = prediction['label']
    percent = np.round(prediction['score'] * 100, 2)
    if label == 'Fertilizer' and prediction['score'] > 0.8:
        return f"Add as New Product ({percent}% probability of being a fertilizer)"
    return f"Product might not be a Fertilizer ({percent}% probability of being a {label})"
def app(input):
    """Gradio entry point: build the catalog and classify one product name.

    Args:
        input: Free-text product name typed by the user. The parameter name
            is kept as-is because it is part of the function's interface.

    Returns:
        The verdict string produced by ``decision``.
    """
    # The catalog is rebuilt for every request, then matched on the
    # "Registered Product" column.
    return decision(input, prepare_catalog(), "Registered Product")
# Sample inputs offered in the UI.
_EXAMPLE_INPUTS = ['Petro Nitrat 16-16-16', 'Petro Nitrat 15-15-15', 'Gramoxone 1 Liter', 'Indomie Goreng Aceh']

# Markdown shown below the interface.
_ABOUT_ARTICLE = """
### About The App
This app is built as a part of the Data Science Weekend 2023 Challenge submission. This app aims to help fertilizer companies to map
free-text POS data of multiple types of products into their own fertilizer catalog. By using this app, the company will be able to
decide whether a product is already available in their catalog, or whether it is a new product that needs (and eligible) to be added to
the catalog. <br>
### How Does it Work?
This app uses a combination of fuzzy matching and machine learning to determine whether a product is already available in the catalog or not.
When a product is not available in the catalog, we will use a IndoBERT model to determine if the product is a fertilizer and eligible to be
added to the catalog. Beforehand, we have fine-tuned the IndoBERT model using a combination of internal and external (web scraping) data, so the
model will be able to learn how fertilizer products (especially the local ones) look like. <br>
### What is the Flags For?
The flag is a part of the Active Transfer Learning feature of this app. When a user flags a product as "Correct" or "Incorrect", the developer
will be able to fine-tune the model using the user's input, hence improving the model's performance. So, please help us to improve the model by
flagging the prediction result! <br>
### I want to test multiple inputs at once!
You can also use our app via API by clicking the "Use via API" below. The API will give developer more flexibility to test multiple inputs
programmatically. <br>
"""

# Text-in / text-out interface around `app`, exposed through the API as
# "search", with "Correct" / "Incorrect" as flagging options.
demo = gr.Interface(
    fn=app,
    inputs="text",
    outputs="text",
    examples=_EXAMPLE_INPUTS,
    title='Fertilizer Catalog Engine 🌽',
    description='Catalog Search Engine and Decision Support System for Fertilizer Company',
    article=_ABOUT_ARTICLE,
    api_name='search',
    flagging_options=["Correct", "Incorrect"],
    theme=gr.themes.Soft(),
)

# Launch only when executed as a script.
if __name__ == "__main__":
    demo.launch(show_api=True)