File size: 9,395 Bytes
e5890ec
 
 
 
 
 
 
 
 
 
 
 
 
d6df412
 
 
e5890ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d6df412
e5890ec
0a02c34
c31d0b7
d6df412
e5890ec
 
 
 
d6df412
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e5890ec
d6df412
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c31d0b7
d6df412
 
 
 
e5890ec
d6df412
 
 
 
 
 
e5890ec
d6df412
e5890ec
 
 
 
d6df412
 
 
 
e5890ec
d6df412
 
 
 
 
 
e5890ec
 
 
 
 
95ecb1f
e5890ec
 
 
 
 
95ecb1f
e5890ec
 
95ecb1f
d6df412
 
 
e5890ec
 
 
 
 
 
545e1c6
e5890ec
545e1c6
e5890ec
 
 
 
 
 
d6df412
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import functions.extract_function as get
import functions.preprocessing_function as preprocess
import functions.modelling_function as modelling

import os
import re
import math
import numpy as np
from rapidfuzz import process, fuzz, utils
from simpletransformers.classification import ClassificationModel
from transformers import pipeline
import gradio as gr

# Make the directory containing this file the working directory, so relative
# paths used later (e.g. the 'flagging/' directory given to gr.Interface) are
# resolved next to this script regardless of where the process was started.
# NOTE(review): the loaders in `functions` presumably rely on this too — confirm.
os.chdir(os.path.dirname(os.path.abspath(__file__)))

def is_nan(text):
    """Return True only when ``text`` converts to a float that is NaN.

    Args:
        text: Any value. Strings such as ``"nan"`` and float NaN values
            (including ``numpy.nan``) count as NaN.

    Returns:
        bool: True if ``float(text)`` succeeds and the result is NaN; False
        for every other value, including values ``float()`` cannot convert.
    """
    try:
        value = float(text)
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric text such as "16-16-16".
        # TypeError: objects float() rejects outright, e.g. None or a list.
        # OverflowError: integers too large for a float. None of these is NaN.
        return False
    return math.isnan(value)

# Build the combined catalog used for matching
def prepare_catalog():
    """Assemble the lookup catalog from the internal and registered product lists.

    Steps, in order:
      1. Load the internal catalog and the registered fertilizer data.
      2. Clean the internal 'Product SKU' column and build two full-name
         columns by appending the lower-cased 'Brand' and 'Type'.
      3. Fuzzy-compare the internal catalog against the registered data.
      4. Slice the registered data using the rows whose
         'Max Similarity Score' exceeds 80 as the filter.
      5. Combine both name lists, drop duplicate rows, and add a 'Formula'
         column holding the first formula-like match in 'Registered Product'
         (NaN when there is none).

    Returns:
        The combined catalog returned by ``preprocess.combine_catalog`` with
        duplicates removed and a 'Formula' column added.
    """
    internal = get.internal_data('catalog')
    registered = get.registered_fertilizer_data()

    # Clean the SKU text, then derive the two "full name" columns.
    internal = preprocess.clean_dataframe(
        internal,
        'Product SKU',
        remove_na=False,
        remove_non_words=True,
        remove_symbols=True,
    )
    brand_and_type = ' ' + internal['Brand'].str.lower() + ' ' + internal['Type'].str.lower()
    internal['Product SKU Full Clean'] = internal['Product SKU Clean'] + brand_and_type
    internal['Product SKU Full'] = internal['Product SKU'] + brand_and_type

    # Score every internal product against the registered fertilizers.
    internal = preprocess.fuzzy_join_compare(
        internal,
        'Product SKU Clean',
        'Product SKU Full Clean',
        registered,
        take_regist_number=True,
        set_ratio_weight=1,
        ratio_weight=0,
    )

    # Internal rows scoring above 80 drive the slice of the registered list.
    high_similarity = internal['Max Similarity Score'] > 80
    registered = preprocess.slice_with_filter(
        registered,
        'Nomor Pendaftaran',
        internal,
        use_filter=True,
        filter_condition=high_similarity,
    )

    merged = preprocess.combine_catalog(
        internal['Product SKU Full'],
        registered['Nama Lengkap'],
        'Product Catalog',
        'Registered Fertilizers',
    )
    merged = merged.drop_duplicates()

    def _first_formula(product_name):
        # First formula-like token in the name, or NaN when nothing matches.
        found = re.findall(r'\d{1,2}\s*[- ]\s*\d{1,2}\s*[- ]\s*\d{1,2}', product_name)
        if found:
            return found[0]
        return np.nan

    merged['Formula'] = merged['Registered Product'].apply(_first_formula)
    return merged

# Decision logic: relate a free-text product name to the prepared catalog.

# Hugging Face model ids used by `decision`.
_DETECT_MODEL = "matthewfarant/indobert-fertilizer-classifier"
_MATCH_MODEL = "matthewfarant/autotrain-fertilizer-pair-classify"

# Three 1-2 digit numbers separated by dashes and/or spaces, e.g. "16-16-16".
_FORMULA_PATTERN = re.compile(r'\d{1,2}\s*[- ]\s*\d{1,2}\s*[- ]\s*\d{1,2}')

# Pipelines already built in this process, keyed by model id.
_PIPELINE_CACHE = {}


def _get_pipeline(model_name):
    """Return the text-classification pipeline for ``model_name``, building it once."""
    if model_name not in _PIPELINE_CACHE:
        _PIPELINE_CACHE[model_name] = pipeline(
            "text-classification",
            model=model_name,
            token=os.getenv('HF_MY_TOKEN'),
        )
    return _PIPELINE_CACHE[model_name]


def _score_catalog(catalog, product_name_catalog, user_input, user_input_formula):
    """Return a copy of ``catalog`` with name and formula similarity columns added."""
    # Work on a copy so the caller's frame (or a filtered slice of it) is never mutated.
    scored = catalog.copy()
    scored['Similarity Score'] = scored[product_name_catalog].apply(
        lambda x: fuzz.token_set_ratio(user_input, x, processor=utils.default_process)
    )
    scored['Formula Similarity'] = scored['Formula'].apply(
        lambda x: fuzz.token_set_ratio(user_input_formula, x, processor=utils.default_process)
    )
    return scored


def decision(user_input, type, catalog, product_name_catalog):
    """Match ``user_input`` against ``catalog`` using the selected search mode.

    Args:
        user_input: Free-text product name.
        type: Search mode: 'Fuzzy Search', 'Training Mode' or
            'Probabilistic Search'. (The name shadows the builtin but is kept
            for backward compatibility with existing callers.)
        catalog: DataFrame with at least the 'Source', 'Formula' and
            'Registered Product' columns plus the ``product_name_catalog``
            column. It is not modified.
        product_name_catalog: Name of the column compared with ``user_input``
            in the fuzzy modes.

    Returns:
        * 'Fuzzy Search': one of the "[1]" .. "[6]" decision messages.
        * 'Training Mode': the 'Registered Product' value of the best fuzzy
          match among rows whose 'Source' is 'Product Catalog'.
        * 'Probabilistic Search': a message with the highest pair-classifier
          score among 'Product Catalog' rows.
        * Any other mode: None.

    Raises:
        IndexError: If no catalog row is available to pick a best match from.
    """
    # Formula found in the user's text (first match), or NaN when absent.
    formulas = _FORMULA_PATTERN.findall(user_input)
    user_input_formula = formulas[0] if formulas else np.nan

    if type == 'Fuzzy Search':
        scored = _score_catalog(catalog, product_name_catalog, user_input, user_input_formula)

        # Best match first; on ties 'Product Catalog' sorts before 'Registered Fertilizers'.
        top = scored.sort_values(
            by=['Similarity Score', 'Formula Similarity', 'Source'],
            ascending=[False, False, True],
        ).head(1)

        similarity = top['Similarity Score'].values[0]
        formula_similarity = top['Formula Similarity'].values[0]
        source = top['Source'].values[0]
        product = top['Registered Product'].values[0]
        formula = top['Formula'].values[0]

        if similarity >= 80 and source == 'Product Catalog' and (formula_similarity == 100 or is_nan(formula)):
            return f"[1] Product is Available in Catalog (SKU Registered as *{product}*)"
        elif similarity >= 80 and source == 'Registered Fertilizers' and (formula_similarity == 100 or is_nan(formula)):
            return f"[2] Add as New Product (Registered in Kementan as *{product}*)"
        elif similarity >= 80 and formula_similarity < 100:
            return f"[3] Add as New Product (Similar to *{product}* in {source} but with different formula)"
        elif similarity < 80:
            # No close catalog match: ask the classifier once and reuse its answer.
            detection = _get_pipeline(_DETECT_MODEL)(user_input)[0]
            label = detection['label']
            score = detection['score']
            if label == 'Fertilizer' and score > 0.8:
                # Rounded to two decimals, consistent with message [5].
                return f"[4] Add as New Product ({np.round(score * 100, 2)}% probability of being a fertilizer)"
            else:
                return f"[5] Product might not be a Fertilizer ({np.round(score * 100,2)}% probability of being a {label})"
        else:
            return "[6] Product is not a Fertilizer"
    elif type == 'Training Mode':
        # Same fuzzy scoring, restricted to the internal product catalog; no model is needed.
        internal = catalog[catalog['Source'] == 'Product Catalog']
        scored = _score_catalog(internal, product_name_catalog, user_input, user_input_formula)

        top = scored.sort_values(by=['Similarity Score', 'Formula Similarity'], ascending=False).head(1)

        return top['Registered Product'].values[0]
    elif type == 'Probabilistic Search':
        # Copy the filtered rows so the new columns are not written onto a slice of the caller's frame.
        internal = catalog[catalog['Source'] == 'Product Catalog'].copy()
        pipe_match = _get_pipeline(_MATCH_MODEL)

        # The pair classifier receives "<user input>[SEP]<catalog product>".
        internal['Concat Input'] = user_input + '[SEP]' + internal['Registered Product'].astype(str)

        # NOTE(review): this takes the score of the first returned label, whichever
        # label that is — confirm it always represents the "match" class.
        internal['Similarity Score'] = internal['Concat Input'].apply(lambda x: pipe_match(x)[0]['score'])

        top = internal.sort_values(by=['Similarity Score'], ascending=False).head(1)

        return f"{np.round(top['Similarity Score'].values[0] * 100,2)}% probability of being a {top['Registered Product'].values[0]}"

    # Unknown search mode: keep the historical behaviour of returning None.
    return None


def app(input, type):
    """Gradio entry point: validate the request, build the catalog, run the search.

    Args:
        input: Free-text product name from the textbox. (The name shadows the
            builtin but is kept for backward compatibility.)
        type: Selected search mode from the radio group.

    Returns:
        str: A prompt asking for complete input when the text is missing,
        empty or only whitespace, or when no search mode is selected;
        otherwise whatever ``decision`` returns for the prepared catalog.
    """
    # An empty textbox arrives as "" rather than None, so test for blank text
    # as well; this avoids building the whole catalog for a query with nothing in it.
    if input is None or not type or not str(input).strip():
        return "Please fill in the input and select the search type"
    catalog = prepare_catalog()
    return decision(input, type, catalog, "Registered Product")
    
# Initialize the app: a Gradio Interface that wraps `app` with a free-text box
# and a search-mode selector, and shows the result as plain text.
demo = gr.Interface(
    fn=app,
    # Two inputs, in the order of app's parameters: product text, then search mode.
    inputs=[
        gr.Textbox(),
        gr.Radio(["Fuzzy Search", "Probabilistic Search", "Training Mode"], type="value")
        ],
    outputs="text",
    # Sample queries; every example uses the "Fuzzy Search" mode.
    examples= [
        ['Petro Nitrat 16-16-16','Fuzzy Search'], 
        ['Petro Nitrat 15-15-15','Fuzzy Search'],
        ['Gramoxone 1 Liter','Fuzzy Search'],
        ['Indomie Goreng Aceh','Fuzzy Search']
        ],
    title = 'Fertilizer Catalog Engine 🌽',
    description = 'Catalog Search Engine and Decision Support System for Fertilizer Company',
    # Markdown/HTML text passed as the interface's `article`.
    article= """
    ### About The App 
     
    This app is built as a part of the Data Science Weekend (DSW) 2023 Challenge submission. This app aims to help fertilizer companies to map 
    free-text POS data of multiple types of products into their own fertilizer catalog. By using this app, the company will be able to
    decide whether a product is already available in their catalog, or whether it is a new product that needs (and eligible) to be added to 
    the catalog. <br>
    ### How Does it Work?
    This app uses a combination of fuzzy matching and machine learning to determine whether a product is already available in the catalog or not.
    When a product is not available in the catalog, we will use an IndoBERT model to determine if the product is a fertilizer and eligible to be
    added to the catalog. Beforehand, we have fine-tuned the IndoBERT model using a combination of internal and external (web scraping) data, so the
    model will be able to learn how fertilizer products (especially the local ones) look like. <br>
    ### What are the Flags For?
    The flag is a part of the "Active Transfer Learning" feature of this app when the user selects "Training Mode". When a user flags an output as "Correct" or "Incorrect",
    the developer will be able to fine-tune the model using the user's input, hence improving the model's performance when the user selects "Probabilistic Search". So, please 
    help us to improve the model by flagging the prediction result 🙏 <br>
    ### I want to test multiple inputs at once!
    You can also use our app via API by clicking the "Use via API" below. The API will give developer more flexibility to test multiple inputs 
    programmatically. <br>
    
    """,
    # Name under which the prediction function is exposed through the API.
    api_name='search',
    # Manual flagging with "Correct"/"Incorrect" options; 'flagging/' is a
    # relative path, so it depends on the working directory set at import time.
    allow_flagging='manual',
    flagging_options=["Correct","Incorrect"],
    flagging_dir='flagging/',
    theme = gr.themes.Soft()
    )

# Run the app
# Launch the Gradio interface only when this file is executed as a script, so
# importing the module (e.g. to reuse `app` or `decision`) does not start it.
if __name__ == "__main__":
    demo.launch(show_api=True)