Spaces:

matthewfarant
/

fertilizer-catalog-engine

Runtime error

App Files Files Community

fertilizer-catalog-engine / app.py

matthewfarant

Update app.py

4560649 about 1 year ago

raw

history blame

7.19 kB

	import functions.extract_function as get
	import functions.preprocessing_function as preprocess
	import functions.modelling_function as modelling

	import os
	import re
	import math
	import numpy as np
	from rapidfuzz import process, fuzz, utils
	from simpletransformers.classification import ClassificationModel
	from transformers import pipeline
	import gradio as gr

	def is_nan(text):
	    """Return True when *text* converts to a float NaN, False otherwise.

	    Values that cannot be converted to a float at all (non-numeric strings,
	    ``None``, lists, ...) are reported as "not NaN" instead of raising.

	    Args:
	        text: Any value; typically a string or a float taken from a
	            DataFrame cell.

	    Returns:
	        bool: True only if ``float(text)`` succeeds and the result is NaN.
	    """
	    try:
	        return math.isnan(float(text))
	    except (TypeError, ValueError):
	        # float() raises ValueError for unparsable strings and TypeError for
	        # unsupported types such as None; neither of those is a NaN.
	        return False

	def prepare_catalog():
	    """Build the combined catalog that product queries are matched against.

	    Loads the internal product catalog and the registered-fertilizer data
	    through the project's ``get`` helpers, cleans and cross-matches them with
	    the ``preprocess`` helpers, merges both into one de-duplicated table and
	    adds a ``'Formula'`` column.

	    Returns:
	        The de-duplicated result of ``preprocess.combine_catalog`` with an
	        extra ``'Formula'`` column: the first substring of
	        ``'Registered Product'`` that looks like three 1-2 digit numbers
	        separated by a whitespace-padded hyphen/space (e.g. ``"16 - 16 - 16"``),
	        or NaN when there is none.
	    """
	    # NOTE(review): this pattern requires whitespace on both sides of each
	    # separator, so a compact "16-16-16" does not match -- confirm intended.
	    formula_pattern = r'\d{1,2}\s[- ]\s\d{1,2}\s[- ]\s\d{1,2}'

	    def first_formula(name):
	        # First formula-like substring of `name`, or NaN if none is present.
	        hits = re.findall(formula_pattern, name)
	        return np.nan if len(hits) == 0 else hits[0]

	    # Internal catalog first, then the external (registered) list.
	    internal = get.internal_data('catalog')
	    registered = get.registered_fertilizer_data()

	    # Clean the SKU names, then build "SKU + brand + type" search strings
	    # from both the cleaned and the raw SKU.
	    internal = preprocess.clean_dataframe(
	        internal,
	        'Product SKU',
	        remove_na=False,
	        remove_non_words=True,
	        remove_symbols=True,
	    )
	    brand_lower = internal['Brand'].str.lower()
	    type_lower = internal['Type'].str.lower()
	    internal['Product SKU Full Clean'] = internal['Product SKU Clean'] + ' ' + brand_lower + ' ' + type_lower
	    internal['Product SKU Full'] = internal['Product SKU'] + ' ' + brand_lower + ' ' + type_lower

	    # Fuzzy-match the internal catalog against the registered fertilizers.
	    # Presumably this adds 'Max Similarity Score' (used below) -- verify
	    # against preprocess.fuzzy_join_compare.
	    internal = preprocess.fuzzy_join_compare(
	        internal,
	        'Product SKU Clean',
	        'Product SKU Full Clean',
	        registered,
	        take_regist_number=True,
	        set_ratio_weight=1,
	        ratio_weight=0,
	    )

	    # Keep only registered fertilizers that are not already in the internal
	    # catalog (internal rows scoring above 80 are treated as matches).
	    already_matched = internal['Max Similarity Score'] > 80
	    registered = preprocess.slice_with_filter(
	        registered,
	        'Nomor Pendaftaran',
	        internal,
	        use_filter=True,
	        filter_condition=already_matched,
	    )

	    # Merge both sources into a single table and drop exact duplicates.
	    merged = preprocess.combine_catalog(
	        internal['Product SKU Full'],
	        registered['Nama Lengkap'],
	        'Product Catalog',
	        'Registered Fertilizers',
	    )
	    merged = merged.drop_duplicates()

	    merged['Formula'] = merged['Registered Product'].apply(first_formula)
	    return merged

	# Text-classification pipeline shared by every decision() call; built lazily
	# so the model is loaded once, and only if a query actually needs it.
	_classifier = None


	def _get_classifier():
	    """Return the fertilizer classifier pipeline, creating it on first use."""
	    global _classifier
	    if _classifier is None:
	        _classifier = pipeline(
	            "text-classification",
	            model="matthewfarant/indobert-fertilizer-classifier",
	            token=os.getenv('HF_MY_TOKEN'),
	        )
	    return _classifier


	def decision(user_input, catalog, product_name_catalog):
	    """Decide how a free-text product name relates to the catalog.

	    The input is fuzzy-matched against every catalog entry. When the best
	    match scores above 80 the answer is derived from that entry's source and
	    formula; otherwise the text classifier decides whether the product looks
	    like a fertilizer at all.

	    Args:
	        user_input: Free-text product name to look up.
	        catalog: DataFrame with at least the columns named by
	            ``product_name_catalog``, ``'Formula'``, ``'Source'`` and
	            ``'Registered Product'``. It is not modified.
	        product_name_catalog: Name of the column holding the product names
	            to match ``user_input`` against.

	    Returns:
	        str: A human-readable verdict message.
	    """
	    # Extract the formula (first match only) from the user's text.
	    # NOTE(review): the pattern requires whitespace around each separator,
	    # so "16-16-16" yields no formula -- confirm this is intended.
	    found = re.findall(r'\d{1,2}\s[- ]\s\d{1,2}\s[- ]\s\d{1,2}', user_input)
	    user_input_formula = found[0] if found else np.nan

	    # Score on a copy so the caller's DataFrame is left untouched.
	    scored = catalog.copy()
	    scored['Similarity Score'] = scored[product_name_catalog].apply(
	        lambda x: fuzz.token_set_ratio(user_input, x, processor=utils.default_process)
	    )
	    scored['Formula Similarity'] = scored['Formula'].apply(
	        lambda x: fuzz.token_set_ratio(user_input_formula, x, processor=utils.default_process)
	    )

	    # Best match first; on ties "Product Catalog" sorts before
	    # "Registered Fertilizers".
	    ranked = scored.sort_values(
	        by=['Similarity Score', 'Formula Similarity', 'Source'],
	        ascending=[False, False, True],
	    ).head(1)

	    # An empty catalog has no best match; fall through to the classifier.
	    top = None if ranked.empty else ranked.iloc[0]

	    if top is not None and top['Similarity Score'] > 80:
	        source = top['Source']
	        formula_score = top['Formula Similarity']
	        matched_name = top['Registered Product']

	        # The formula is acceptable when it matches exactly or when the
	        # catalog entry has no formula to compare against.
	        if source in ('Product Catalog', 'Registered Fertilizers') and (
	            formula_score == 100 or is_nan(top['Formula'])
	        ):
	            if source == 'Product Catalog':
	                return f"Product is Available in Catalog (SKU Registered as {matched_name})"
	            return f"Add as New Product (Registered in Kementan as {matched_name})"
	        if formula_score < 100:
	            return f"Add as New Product (Similar to {matched_name} in {source} but with different formula)"
	        return "Product is not a Fertilizer"

	    # No sufficiently similar catalog entry (score <= 80): ask the classifier
	    # once and reuse its answer for both the test and the message.
	    prediction = _get_classifier()(user_input)[0]
	    label = prediction['label']
	    percent = np.round(prediction['score'] * 100, 2)
	    if label == 'Fertilizer' and prediction['score'] > 0.8:
	        return f"Add as New Product ({percent}% probability of being a fertilizer)"
	    return f"Product might not be a Fertilizer ({percent}% probability of being a {label})"

	def app(input):
	    """Gradio handler: classify one free-text product name.

	    Args:
	        input: The product name typed by the user.

	    Returns:
	        The verdict message produced by ``decision`` when matching against
	        the ``"Registered Product"`` column of a freshly prepared catalog.
	    """
	    # NOTE(review): the catalog is rebuilt on every request; consider caching
	    # it if the underlying data does not change between calls.
	    fresh_catalog = prepare_catalog()
	    verdict = decision(input, fresh_catalog, "Registered Product")
	    return verdict

	# Sample queries offered as one-click examples in the UI.
	_EXAMPLE_QUERIES = ['Petro Nitrat 16-16-16', 'Petro Nitrat 15-15-15', 'Gramoxone 1 Liter', 'Indomie Goreng Aceh']

	# Markdown shown below the interface.
	_ABOUT_ARTICLE = """


	### About The App

	This app is built as a part of the Data Science Weekend 2023 Challenge submission. This app aims to help fertilizer companies to map
	free-text POS data of multiple types of products into their own fertilizer catalog. By using this app, the company will be able to
	decide whether a product is already available in their catalog, or whether it is a new product that needs (and eligible) to be added to
	the catalog. <br>

	### How Does it Work?

	This app uses a combination of fuzzy matching and machine learning to determine whether a product is already available in the catalog or not.
	When a product is not available in the catalog, we will use a IndoBERT model to determine if the product is a fertilizer and eligible to be
	added to the catalog. Beforehand, we have fine-tuned the IndoBERT model using a combination of internal and external (web scraping) data, so the
	model will be able to learn how fertilizer products (especially the local ones) look like. <br>

	### What is the Flags For?

	The flag is a part of the Active Transfer Learning feature of this app. When a user flags a product as "Correct" or "Incorrect", the developer
	will be able to fine-tune the model using the user's input, hence improving the model's performance. So, please help us to improve the model by
	flagging the prediction result! <br>

	### I want to test multiple inputs at once!

	You can also use our app via API by clicking the "Use via API" below. The API will give developer more flexibility to test multiple inputs
	programmatically. <br>

	"""

	# Text-in / text-out interface around app(), exposed over the API as
	# "search" and offering "Correct" / "Incorrect" flagging choices.
	demo = gr.Interface(
	    fn=app,
	    inputs="text",
	    outputs="text",
	    examples=_EXAMPLE_QUERIES,
	    title='Fertilizer Catalog Engine 🌽',
	    description='Catalog Search Engine and Decision Support System for Fertilizer Company',
	    article=_ABOUT_ARTICLE,
	    api_name='search',
	    flagging_options=["Correct", "Incorrect"],
	    theme=gr.themes.Soft(),
	)

	# Start the server only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch(show_api=True)