matthewfarant committed on
Commit
d6df412
•
1 Parent(s): 95ecb1f
Files changed (1) hide show
  1. app.py +69 -39
app.py CHANGED
@@ -11,6 +11,9 @@ from simpletransformers.classification import ClassificationModel
11
  from transformers import pipeline
12
  import gradio as gr
13
 
 
 
 
14
  def is_nan(text):
15
  try:
16
  value = float(text)
@@ -43,73 +46,100 @@ def prepare_catalog():
43
  return combined_catalog
44
 
45
  # Your existing decision function
46
- def decision(user_input, catalog, product_name_catalog):
47
  # Initialize the model
48
- pipe = pipeline("text-classification", model="matthewfarant/indobert-fertilizer-classifier", token = os.getenv('HF_MY_TOKEN'))
49
-
 
50
  # Extract formula
51
  user_input_formula = re.findall(r'\d{1,2}\s*[- ]\s*\d{1,2}\s*[- ]\s*\d{1,2}', user_input)
52
  user_input_formula = np.nan if len(user_input_formula) == 0 else user_input_formula[0]
53
 
54
- # Similar Product
55
- catalog['Similarity Score'] = catalog[product_name_catalog].apply(lambda x: fuzz.token_set_ratio(user_input, x, processor=utils.default_process))
56
- catalog['Formula Similarity'] = catalog['Formula'].apply(lambda x: fuzz.token_set_ratio(user_input_formula, x, processor=utils.default_process))
57
-
58
- # Take Top Similar Product. Take "Product Catalog" first
59
- catalog = catalog.sort_values(by=['Similarity Score', 'Formula Similarity', 'Source'], ascending=[False, False, True]).head(1)
60
-
61
- # Condition
62
- if catalog['Similarity Score'].values[0] > 80 and catalog['Source'].values[0] == 'Product Catalog' and (catalog['Formula Similarity'].values[0] == 100 or is_nan(catalog['Formula'].values[0])):
63
- return f"Product is Available in Catalog (SKU Registered as {catalog['Registered Product'].values[0]})"
64
- elif catalog['Similarity Score'].values[0] > 80 and catalog['Source'].values[0] == 'Registered Fertilizers' and (catalog['Formula Similarity'].values[0] == 100 or is_nan(catalog['Formula'].values[0])):
65
- return f"Add as New Product (Registered in Kementan as {catalog['Registered Product'].values[0]})"
66
- elif catalog['Similarity Score'].values[0] > 80 and catalog['Formula Similarity'].values[0] < 100:
67
- return f"Add as New Product (Similar to {catalog['Registered Product'].values[0]} in {catalog['Source'].values[0]} but with different formula)"
68
- elif catalog['Similarity Score'].values[0] < 80:
69
- if pipe(user_input)[0]['label'] == 'Fertilizer' and pipe(user_input)[0]['score'] > 0.8:
70
- return f"Add as New Product ({pipe(user_input)[0]['score'] * 100}% probability of being a fertilizer)"
 
 
 
71
  else:
72
- return f"Product might not be a Fertilizer ({np.round(pipe(user_input)[0]['score'] * 100,2)}% probability of being a {pipe(user_input)[0]['label']})"
73
- else:
74
- return "Product is not a Fertilizer"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
- def app(input):
 
 
 
 
 
77
  catalog = prepare_catalog()
78
- return decision(input, catalog, "Registered Product")
79
 
80
  # Initialize the app
81
  demo = gr.Interface(
82
  fn=app,
83
- inputs="text",
 
 
 
84
  outputs="text",
85
- examples= ['Petro Nitrat 16-16-16', 'Petro Nitrat 15-15-15', 'Gramoxone 1 Liter', 'Indomie Goreng Aceh'],
 
 
 
 
 
86
  title = 'Fertilizer Catalog Engine 🌽',
87
  description = 'Catalog Search Engine and Decision Support System for Fertilizer Company',
88
  article= """
89
-
90
-
91
  ### About The App
92
 
93
  This app is built as a part of the Data Science Weekend (DSW) 2023 Challenge submission. This app aims to help fertilizer companies to map
94
  free-text POS data of multiple types of products into their own fertilizer catalog. By using this app, the company will be able to
95
  decide whether a product is already available in their catalog, or whether it is a new product that needs (and eligible) to be added to
96
  the catalog. <br>
97
-
98
  ### How Does it Work?
99
-
100
  This app uses a combination of fuzzy matching and machine learning to determine whether a product is already available in the catalog or not.
101
  When a product is not available in the catalog, we will use an IndoBERT model to determine if the product is a fertilizer and eligible to be
102
  added to the catalog. Beforehand, we have fine-tuned the IndoBERT model using a combination of internal and external (web scraping) data, so the
103
  model will be able to learn how fertilizer products (especially the local ones) look like. <br>
104
-
105
  ### What are the Flags For?
106
-
107
- The flag is a part of the "Active Transfer Learning" feature of this app. When a user flags an output as "Correct" or "Incorrect", the developer
108
- will be able to fine-tune the model using the user's input, hence improving the model's performance. So, please help us to improve the model by
109
- flagging the prediction result 🙏 <br>
110
-
111
  ### I want to test multiple inputs at once!
112
-
113
  You can also use our app via API by clicking the "Use via API" below. The API will give developer more flexibility to test multiple inputs
114
  programmatically. <br>
115
 
@@ -124,4 +154,4 @@ demo = gr.Interface(
124
  # Run the app
125
 
126
  if __name__ == "__main__":
127
- demo.launch(show_api=True)
 
11
  from transformers import pipeline
12
  import gradio as gr
13
 
14
+ # set current directory
15
+ os.chdir(os.path.dirname(os.path.abspath(__file__)))
16
+
17
  def is_nan(text):
18
  try:
19
  value = float(text)
 
46
  return combined_catalog
47
 
48
  # Your existing decision function
49
# Loaded Hugging Face pipelines, keyed by model name. Loading a model is
# expensive, so each one is created lazily and then reused across requests.
_PIPELINE_CACHE = {}


def _get_pipeline(model_name):
    """Return the text-classification pipeline for *model_name*, loading it once.

    The Hugging Face access token is read from the ``HF_MY_TOKEN`` environment
    variable so that no credential is stored in the source code.
    NOTE(review): the token that was previously committed in this function
    should be revoked, since it remains visible in the repository history.
    """
    if model_name not in _PIPELINE_CACHE:
        _PIPELINE_CACHE[model_name] = pipeline(
            "text-classification",
            model=model_name,
            token=os.getenv('HF_MY_TOKEN'),
        )
    return _PIPELINE_CACHE[model_name]


def _best_fuzzy_match(catalog, user_input, user_input_formula,
                      product_name_catalog, sort_columns, ascending):
    """Return a one-row DataFrame holding the best fuzzy match in *catalog*.

    Adds the 'Similarity Score' (product name vs. *user_input*) and
    'Formula Similarity' (catalog formula vs. *user_input_formula*) columns,
    sorts by *sort_columns* / *ascending* and keeps the first row.
    """
    # Work on a copy so neither the caller's DataFrame nor a filtered slice of
    # it is mutated (this also avoids pandas' SettingWithCopyWarning).
    scored = catalog.copy()
    scored['Similarity Score'] = scored[product_name_catalog].apply(
        lambda x: fuzz.token_set_ratio(user_input, x, processor=utils.default_process))
    scored['Formula Similarity'] = scored['Formula'].apply(
        lambda x: fuzz.token_set_ratio(user_input_formula, x, processor=utils.default_process))
    return scored.sort_values(by=sort_columns, ascending=ascending).head(1)


def decision(user_input, type, catalog, product_name_catalog):
    """Match a free-text product name against the catalog and describe the result.

    Args:
        user_input: Free-text product description (str).
        type: Search mode: 'Fuzzy Search', 'Training Mode' or
            'Probabilistic Search'. (The name shadows the builtin but is kept
            for backward compatibility with existing callers.)
        catalog: DataFrame with at least the 'Source', 'Formula' and
            'Registered Product' columns plus the *product_name_catalog* column.
        product_name_catalog: Name of the column holding the product names
            that *user_input* is fuzzy-matched against.

    Returns:
        A human-readable decision string. In 'Training Mode' this is the value
        of 'Registered Product' for the best match.

    Raises:
        ValueError: If *type* is not one of the supported search modes.
    """
    # Extract formula: three 1-2 digit numbers separated by '-' or spaces,
    # e.g. "16-16-16".
    found_formulas = re.findall(r'\d{1,2}\s*[- ]\s*\d{1,2}\s*[- ]\s*\d{1,2}', user_input)
    user_input_formula = found_formulas[0] if found_formulas else np.nan

    if type == 'Fuzzy Search':
        # Take the top similar product; on ties, 'Product Catalog' sorts
        # before 'Registered Fertilizers' because 'Source' is ascending.
        top = _best_fuzzy_match(
            catalog, user_input, user_input_formula, product_name_catalog,
            sort_columns=['Similarity Score', 'Formula Similarity', 'Source'],
            ascending=[False, False, True],
        )
        similarity = top['Similarity Score'].values[0]
        formula_similarity = top['Formula Similarity'].values[0]
        source = top['Source'].values[0]
        formula = top['Formula'].values[0]
        registered = top['Registered Product'].values[0]

        if similarity >= 80 and source == 'Product Catalog' and (formula_similarity == 100 or is_nan(formula)):
            return f"[1] Product is Available in Catalog (SKU Registered as *{registered}*)"
        elif similarity >= 80 and source == 'Registered Fertilizers' and (formula_similarity == 100 or is_nan(formula)):
            return f"[2] Add as New Product (Registered in Kementan as *{registered}*)"
        elif similarity >= 80 and formula_similarity < 100:
            return f"[3] Add as New Product (Similar to *{registered}* in {source} but with different formula)"
        elif similarity < 80:
            # No close catalog match: ask the classifier (once) whether the
            # text looks like a fertilizer at all.
            detection = _get_pipeline("matthewfarant/indobert-fertilizer-classifier")(user_input)[0]
            percentage = np.round(detection['score'] * 100, 2)
            if detection['label'] == 'Fertilizer' and detection['score'] > 0.8:
                return f"[4] Add as New Product ({percentage}% probability of being a fertilizer)"
            else:
                return f"[5] Product might not be a Fertilizer ({percentage}% probability of being a {detection['label']})"
        else:
            return "[6] Product is not a Fertilizer"

    if type == 'Training Mode':
        # Same matching as above, but only against the company's own
        # 'Product Catalog' rows; returns the bare product name.
        top = _best_fuzzy_match(
            catalog[catalog['Source'] == 'Product Catalog'],
            user_input, user_input_formula, product_name_catalog,
            sort_columns=['Similarity Score', 'Formula Similarity'],
            ascending=False,
        )
        return top['Registered Product'].values[0]

    if type == 'Probabilistic Search':
        pipe_match = _get_pipeline("matthewfarant/indobert-fertilizer-matching")
        candidates = catalog[catalog['Source'] == 'Product Catalog'].copy()

        # The matching model scores the pair "<input> dan <catalog product>".
        candidates['Concat Input'] = user_input + ' dan ' + candidates['Registered Product'].astype(str)
        # NOTE(review): this ranks by the score of whatever label the model
        # predicts; confirm that a high score always means "match" and not a
        # confident "no match".
        candidates['Similarity Score'] = candidates['Concat Input'].apply(lambda x: pipe_match(x)[0]['score'])

        top = candidates.sort_values(by=['Similarity Score'], ascending=False).head(1)
        return f"{np.round(top['Similarity Score'].values[0] * 100,2)}% probability of being a {top['Registered Product'].values[0]}"

    raise ValueError(f"Unsupported search type: {type!r}")
102
+
103
+
104
def app(input, type):
    """Gradio entry point: validate the form values and run the catalog decision.

    Args:
        input: Free-text product description from the textbox.
        type: Search mode selected in the radio group.
            (Both parameter names shadow builtins but are kept so existing
            callers keep working.)

    Returns:
        The decision string from ``decision``, or a prompt asking the user to
        complete the form when either value is missing.
    """
    # An untouched textbox may arrive as an empty string rather than None, so
    # treat empty and whitespace-only text as missing as well.
    if input is None or type is None or not str(input).strip():
        return "Please fill in the input and select the search type"
    catalog = prepare_catalog()
    return decision(input, type, catalog, "Registered Product")
109
 
110
  # Initialize the app
111
  demo = gr.Interface(
112
  fn=app,
113
+ inputs=[
114
+ gr.Textbox(),
115
+ gr.Radio(["Fuzzy Search", "Probabilistic Search", "Training Mode"], type="value")
116
+ ],
117
  outputs="text",
118
+ examples= [
119
+ ['Petro Nitrat 16-16-16','Fuzzy Search'],
120
+ ['Petro Nitrat 15-15-15','Fuzzy Search'],
121
+ ['Gramoxone 1 Liter','Fuzzy Search'],
122
+ ['Indomie Goreng Aceh','Fuzzy Search']
123
+ ],
124
  title = 'Fertilizer Catalog Engine 🌽',
125
  description = 'Catalog Search Engine and Decision Support System for Fertilizer Company',
126
  article= """
 
 
127
  ### About The App
128
 
129
  This app is built as a part of the Data Science Weekend (DSW) 2023 Challenge submission. This app aims to help fertilizer companies to map
130
  free-text POS data of multiple types of products into their own fertilizer catalog. By using this app, the company will be able to
131
  decide whether a product is already available in their catalog, or whether it is a new product that needs (and eligible) to be added to
132
  the catalog. <br>
 
133
  ### How Does it Work?
 
134
  This app uses a combination of fuzzy matching and machine learning to determine whether a product is already available in the catalog or not.
135
  When a product is not available in the catalog, we will use an IndoBERT model to determine if the product is a fertilizer and eligible to be
136
  added to the catalog. Beforehand, we have fine-tuned the IndoBERT model using a combination of internal and external (web scraping) data, so the
137
  model will be able to learn how fertilizer products (especially the local ones) look like. <br>
 
138
  ### What are the Flags For?
139
+ The flag is a part of the "Active Transfer Learning" feature of this app when the user selects "Training Mode". When a user flags an output as "Correct" or "Incorrect",
140
+ the developer will be able to fine-tune the model using the user's input, hence improving the model's performance when the user selects "Probabilistic Search". So, please
141
+ help us to improve the model by flagging the prediction result 🙏 <br>
 
 
142
  ### I want to test multiple inputs at once!
 
143
  You can also use our app via API by clicking the "Use via API" below. The API will give developer more flexibility to test multiple inputs
144
  programmatically. <br>
145
 
 
154
  # Run the app
155
 
156
  if __name__ == "__main__":
157
+ demo.launch(show_api=True)