matthewfarant committed on
Commit
b279c69
·
1 Parent(s): 484b915

Initial commit

Browse files
functions/extract_function.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import yaml
3
+ import requests
4
+ import pandas as pd
5
+
6
def internal_data(type):
    """
    Load the internal catalog or query workbooks into a single dataframe.

    Every file whose name ends with '.xlsx' directly inside the 'catalog' or
    'query' folder (relative to the current working directory) is read with
    pandas.read_excel and the resulting frames are stacked row-wise with a
    fresh index.

    :param type: str, 'catalog' or 'query'; it is also the folder that is read

    :return: pandas.DataFrame, the concatenated rows of every workbook; or the
             string 'Error: type must be either catalog or query' when type
             is any other value
    :raises ValueError: raised by pandas.concat when the folder holds no
                        '.xlsx' file
    """
    # NOTE: the parameter shadows the builtin ``type``; the name is kept
    # because callers may pass it by keyword.
    if type not in ('catalog', 'query'):
        return 'Error: type must be either catalog or query'

    # Both sources share one layout, so a single code path serves both; the
    # folder name is the validated ``type`` value itself.
    frames = [
        pd.read_excel(os.path.join(type, file_name))
        for file_name in os.listdir(type)
        if file_name.endswith('.xlsx')
    ]
    return pd.concat(frames, ignore_index=True)
34
+
35
def _scrape_registry_pages(base_url, column_positions, column_names, timeout=60):
    """
    Download consecutive result pages and stack the selected table columns.

    Pages are requested as ``base_url + str(page)`` starting from page 1.
    Scraping stops at the first page for which selecting the sixth HTML table
    or the requested columns raises IndexError.

    :param base_url: str, page URL without the trailing page number
    :param column_positions: list of int, positional columns to keep
    :param column_names: list of str, new names for those columns (same order)
    :param timeout: int, seconds to wait for the server before giving up

    :return: pandas.DataFrame, rows of every scraped page with a
             'Page Number' column, rows containing NaN removed
    :raises ValueError: raised by pandas.concat when not a single page
                        could be parsed
    """
    # Columns are renamed by label; this relies on the parsed table having
    # integer column labels equal to their positions, as the original
    # hard-coded rename mappings did.
    rename_map = dict(zip(column_positions, column_names))
    frames = []
    page = 1
    while True:
        # A timeout keeps a stalled connection from hanging this open-ended loop.
        html = requests.get(base_url + str(page), timeout=timeout).content
        try:
            # Table index 5, minus its first two rows and last row, is the
            # slice the original code selected on every page.
            table = pd.read_html(html)[5].iloc[2:-1, column_positions].rename(columns=rename_map)
        except IndexError:
            break
        table['Page Number'] = page
        frames.append(table)
        page += 1

    return pd.concat(frames, ignore_index=True).dropna()


def registered_fertilizer_data():
    """
    Return the registered fertilizer data, scraping it only when needed.

    When the 'external' folder is empty, the organic and inorganic listings
    are scraped page by page from the URLs found under
    scraping_url -> organik / anorganik (first entry of each) in config.yaml.
    Otherwise 'external/registered_fertilizers.csv' is read instead.

    The scraped frame is returned but not written to disk by this function.

    :return: pandas.DataFrame, dataframe containing registered fertilizer data;
             a freshly scraped frame has the columns 'Merek', 'Jenis',
             'Nomor Pendaftaran', 'Page Number' and 'Nama Lengkap'
    """
    # Anything present in the folder counts as "already extracted".
    if os.listdir('external'):
        return pd.read_csv('external/registered_fertilizers.csv')

    print('External folder is empty. Extracting data from Ministry of Agriculture website...')

    # Read the configuration once and close the file deterministically.
    # NOTE(review): FullLoader is kept from the original; config.yaml is
    # assumed to be a trusted local file.
    with open('config.yaml') as config_file:
        scraping_url = yaml.load(config_file, Loader=yaml.FullLoader)['scraping_url']

    print('Extracting Organic Fertilizer Data...')
    registered_organic_fertilizers = _scrape_registry_pages(
        scraping_url['organik'][0], [2, 3, 6], ['Merek', 'Jenis', 'Nomor Pendaftaran']
    )

    print('Extracting Inorganic Fertilizer Data...')
    registered_inorganic_fertilizers = _scrape_registry_pages(
        scraping_url['anorganik'][0], [5, 6, 7], ['Merek', 'Jenis', 'Nomor Pendaftaran']
    )

    registered_fertilizers = pd.concat(
        [registered_organic_fertilizers, registered_inorganic_fertilizers], ignore_index=True
    )
    # Full name is "<Jenis> <Merek>", joined with a single space.
    registered_fertilizers['Nama Lengkap'] = registered_fertilizers['Jenis'] + ' ' + registered_fertilizers['Merek']
    return registered_fertilizers
86
+
87
def scrape_result():
    """
    Extract scraped result data.

    Reads every CSV file directly inside the 'scrape_result' folder (relative
    to the current working directory) and stacks them row-wise. Entries that
    are not regular files, or whose name does not end with '.csv' (checked
    case-insensitively), are skipped so that sub-folders or placeholder files
    do not break the load.

    :return: pandas.DataFrame, dataframe containing scraped result data
    :raises ValueError: raised by pandas.concat when the folder holds no CSV file
    """
    frames = []
    for filename in os.listdir('scrape_result'):
        path = os.path.join('scrape_result', filename)
        if not os.path.isfile(path) or not filename.lower().endswith('.csv'):
            continue
        frames.append(pd.read_csv(path))

    # combine
    return pd.concat(frames, ignore_index=True)
functions/modelling_function.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import matplotlib.pyplot as plt
4
+ import seaborn as sns
5
+ import yaml
6
+ import os
7
+ import warnings
8
+ from rapidfuzz import fuzz, utils
9
+ from simpletransformers.classification import ClassificationModel, ClassificationArgs
10
+ from sklearn.model_selection import train_test_split
11
+ from sklearn.metrics import confusion_matrix, classification_report
12
+ from scipy.special import softmax
13
+
14
+
15
def _label_from_categories(categories):
    """Map a scraped category list to one of the coarse training labels."""
    if not isinstance(categories, list) or not categories:
        return 'Others'
    # A single category is treated as a confident label, several as "Medium".
    confidence = 'High' if len(categories) == 1 else 'Medium'
    if 'Garden Soil & Fertilizers' in categories:
        return 'Fertilizer - ' + confidence
    if 'Weeds & Pest Control' in categories:
        return 'Pesticide - ' + confidence
    return 'Others'


def generate_training_data(df, text_column, label_column, external_table = None, external_column = None, add_external_table=False, sampling=True):
    """
    This function generates training data for the model.

    When the 'training' folder already contains something, the cached
    'training/training_data.csv' is returned and every argument is ignored.
    Otherwise the labels are derived from the category lists in
    ``label_column``; only 'Fertilizer - High', 'Pesticide - High' and
    'Others' rows are kept, and 'Others' rows whose text matches one of the
    ``excluded_words`` patterns from config.yaml are dropped.

    The input frames are not modified; new frames are returned.

    :param df: pandas.DataFrame, dataframe containing product name and category list
    :param text_column: str, column name containing product name
    :param label_column: str, column name containing the list of category names
    :param external_table: pandas.DataFrame, extra products that are all labelled as fertilizer
    :param external_column: str, column name containing product name in external_table
    :param add_external_table: bool, whether to add external table or not
    :param sampling: bool, whether to do sampling or not (only used with add_external_table)

    :return: pandas.DataFrame; with add_external_table the columns are
             'product_name' and 'category_name' (0 = fertilizer,
             1 = pesticide, 2 = others), otherwise ``text_column`` and
             'category_name' holding the textual label
    :raises ValueError: raised by pandas when sampling is requested and a
                        class has fewer rows than its sample size
    """
    if os.listdir('training'):
        return pd.read_csv('training/training_data.csv')

    print('Training folder is empty. Generating training data...')
    # NOTE(review): FullLoader is kept from the original; config.yaml is
    # assumed to be a trusted local file.
    with open('config.yaml') as config_file:
        units = yaml.load(config_file, Loader=yaml.FullLoader)['excluded_words']

    # Work on a fresh two-column frame so the caller's dataframe stays untouched.
    labelled = df[[text_column]].assign(category_name=df[label_column].apply(_label_from_categories))

    # take only where category_name is Fertilizer - High or Pesticide - High or Others
    labelled = labelled[labelled['category_name'].isin(['Fertilizer - High', 'Pesticide - High', 'Others'])]

    # exclude product name that contains units AND category_name is Others
    if units:
        # The joined words are used as a regular expression, as before.
        # na=False keeps the mask boolean when a product name is missing.
        has_unit = labelled[text_column].str.contains('|'.join(units), na=False)
        labelled = labelled[~(has_unit & (labelled['category_name'] == 'Others'))]

    if not add_external_table:
        return labelled

    external = external_table[[external_column]].copy()
    external.columns = [text_column]
    external['category_name'] = 'Fertilizer - High'

    training_df = pd.concat([external, labelled])
    training_df.columns = ['product_name', 'category_name']

    label_ids = {'Fertilizer - High': 0, 'Pesticide - High': 1}
    training_df['category_name'] = training_df['category_name'].apply(lambda label: label_ids.get(label, 2))

    if not sampling:
        return training_df

    # Fixed per-class sample sizes: 1250 fertilizer, 1250 pesticide, 1500 others.
    return pd.concat([
        training_df[training_df['category_name'] == label].sample(n=size)
        for label, size in ((0, 1250), (1, 1250), (2, 1500))
    ])
59
+
60
def category_reassign(row, reference_df, checked_category, threshold=70):
    """
    Re-label a product by fuzzy-matching its name against a reference table.

    Rows whose category differs from ``checked_category`` are returned
    unchanged. Otherwise the reference rows are scanned in order and the
    category of the first row that belongs to another category and whose
    product name scores at least ``threshold`` is returned.

    :param row: pandas.Series, row with 'category_name' and 'product_name'
    :param reference_df: pandas.DataFrame, reference rows with 'category_name' and 'product_name'
    :param checked_category: category value whose rows should be re-examined
    :param threshold: int, minimum similarity score that counts as a match

    :return: category name, either the original or the reassigned one
    """
    # Guard clause: only rows of the checked category are candidates.
    if row['category_name'] != checked_category:
        return row['category_name']

    for _, candidate in reference_df.iterrows():
        if (
            candidate['category_name'] != checked_category
            and fuzz.ratio(row['product_name'], candidate['product_name'], processor= utils.default_process) >= threshold
        ):
            return candidate['category_name']

    # No sufficiently similar product in another category.
    return checked_category
80
+
81
def train_model(df, stratify=True, model_type='bert', use_existing_model=False, model_name=None):
    """
    This function trains the model using the configuration in config.yaml

    The 'parameters' section of config.yaml supplies the test size
    (training_args), the model arguments (model_args), the pretrained model
    per model type (model_types) and the class names (class_names).

    :param df: pandas.DataFrame, dataframe containing product name and category name
    :param stratify: bool, whether to stratify the train/test split on 'category_name'
    :param model_type: str, type of model to use
    :param use_existing_model: bool, whether to use existing model or not
    :param model_name: str, name of existing model (used when use_existing_model is True)

    :return: tuple of
             simpletransformers.classification.ClassificationModel, model;
             numpy.ndarray, predictions on the test split;
             str, classification report;
             pandas.DataFrame, training dataframe;
             pandas.DataFrame, testing dataframe;
             list, list of class names
    """
    # NOTE: this silences warnings for the whole process, not only this call.
    warnings.filterwarnings('ignore')

    # Read the configuration once and close the file deterministically.
    # NOTE(review): FullLoader is kept from the original; config.yaml is
    # assumed to be a trusted local file.
    with open('config.yaml') as config_file:
        parameters = yaml.load(config_file, Loader=yaml.FullLoader)['parameters']

    test_size = parameters['training_args']['test_size']
    # Honour the ``stratify`` flag; previously the split was always stratified.
    stratify_on = df['category_name'] if stratify else None
    train_df, test_df = train_test_split(df, test_size=test_size, stratify=stratify_on)

    # Optional model configuration
    model_config = parameters['model_args']
    model_args = ClassificationArgs()
    for option in ('num_train_epochs', 'train_batch_size', 'eval_batch_size',
                   'overwrite_output_dir', 'fp16', 'do_lower_case'):
        setattr(model_args, option, model_config[option])

    # Create a ClassificationModel
    class_names = parameters['class_names']
    pretrained = model_name if use_existing_model else parameters['model_types'][model_type]
    model = ClassificationModel(model_type, pretrained, num_labels=len(class_names), args=model_args, use_cuda=False)

    # Train the model
    model.train_model(train_df)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(test_df)
    preds = np.argmax(model_outputs, axis=1)
    class_report = classification_report(test_df['category_name'], preds, target_names=class_names)

    return model, preds, class_report, train_df, test_df, class_names
131
+
132
def save_model(model, model_name):
    """
    Persist the model weights, tokenizer and config under one directory.

    :param model: simpletransformers.classification.ClassificationModel, model
    :param model_name: str, name of model; used as the output directory

    :return: None
    """
    output_dir = model_name
    model.model.save_pretrained(output_dir)
    model.tokenizer.save_pretrained(output_dir)

    # The config is written to the same directory, spelled with a trailing slash.
    config_dir = output_dir + '/'
    model.config.save_pretrained(config_dir)
    print('Model saved to ' + config_dir)
145
+
146
def show_confusion_matrix(test_category, preds, class_names):
    """
    This function draws the confusion matrix as an annotated heatmap.

    :param test_category: numpy.ndarray, array of true category labels
    :param preds: numpy.ndarray, array of predictions
    :param class_names: list, list of class names, used as row and column labels

    :return: matplotlib Axes holding the heatmap (previously documented but
             not actually returned)
    """
    cm = confusion_matrix(test_category, preds)
    # NOTE(review): the matrix only has one row/column per label that occurs
    # in the inputs, so this assumes every class in class_names is present.
    df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
    hmap = sns.heatmap(df_cm, annot=True, fmt="d", cmap="Blues")
    hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation=0, ha='right')
    hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation=30, ha='right')
    plt.ylabel('True Topics')
    plt.xlabel('Predicted Topics')
    return hmap
163
+
164
def predict_proba(model,text):
    """
    This function predicts the probability of each class and prints it as text.

    :param model: simpletransformers.classification.ClassificationModel, model
    :param text: str, text to predict

    :return: numpy.ndarray, array of probabilities (index 0 is printed as
             Fertilizer, 1 as Pesticide, 2 as Others)
    """
    # predict() returns (predictions, raw outputs); the raw outputs of the
    # single input text are turned into probabilities with softmax.
    proba = softmax(model.predict([text])[1])[0]
    print('-----------------------------')
    print('Text to Predict: ', text)
    print('Probability of each class:')
    print('Fertilizer: ', proba[0])
    print('Pesticide: ', proba[1])
    print('Others: ', proba[2])
    # Return the array as well, as the documented contract promises.
    return proba
180
+
181
def predict_proba_array(model,text):
    """
    Return the class probabilities of a single text as an array.

    :param model: simpletransformers.classification.ClassificationModel, model
    :param text: str, text to predict

    :return: numpy.ndarray, array of probabilities
    """
    # predict() yields (predictions, raw outputs); only the raw outputs are needed.
    prediction = model.predict([text])
    raw_outputs = prediction[1]
    probabilities = softmax(raw_outputs)
    return probabilities[0]
functions/preprocessing_function.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ from rapidfuzz import process, fuzz, utils
4
+
5
def clean_dataframe(df, column, remove_na=True, remove_non_words=True, remove_symbols=True, remove_duplicates=True):
    """
    This function adds a cleaned copy of a text column and optionally drops
    missing, empty and duplicated rows.

    The cleaned text is stored in a new column named ``column + ' Clean'``.
    The input dataframe is not modified; a new dataframe is returned.

    Parameters:
    df (pandas.DataFrame): The dataframe to clean.
    column (str): The column to clean.
    remove_na (bool): Whether to drop rows whose value is missing or whose cleaned text is blank.
    remove_non_words (bool): Whether to drop every whitespace-separated word that contains a digit.
    remove_symbols (bool): Whether to drop characters that are neither alphanumeric nor whitespace.
    remove_duplicates (bool): Whether to keep only the first row per cleaned text.

    Returns:
    pandas.DataFrame: The cleaned dataframe, keeping the original index labels of the surviving rows.
    """
    clean_column = column + ' Clean'

    def _normalise(value):
        # Lowercase first; non-string values are stringified.
        text = str(value).lower()
        if remove_non_words:
            # Drop whole words containing a digit (e.g. "50kg", "46%").
            text = ' '.join(word for word in text.split() if not any(char.isdigit() for char in word))
        if remove_symbols:
            text = ''.join(char for char in text if char.isalnum() or char.isspace())
        return text

    # Computing the column element-wise keeps it aligned with any index;
    # writing by positional counter mis-assigned rows on a non-default index.
    cleaned = df.copy()
    cleaned[clean_column] = cleaned[column].apply(_normalise)

    # Drop rows whose source value is missing (it would otherwise survive as
    # the text 'nan') or whose cleaned text is empty / whitespace only.
    if remove_na:
        has_text = cleaned[column].notna() & (cleaned[clean_column].str.strip() != '')
        cleaned = cleaned[has_text]

    # Remove duplicates
    if remove_duplicates:
        cleaned = cleaned.drop_duplicates(subset=[clean_column])

    return cleaned
48
+
49
def fuzzy_join(row, df_reference, column_reference, column_matched_to, take_regist_number=False, take_source = False, set_ratio_weight=0.5, ratio_weight=0.5):
    """
    This function finds the reference product name that is most similar to
    the given row and returns it together with its similarity score.

    The score is ``fuzz.ratio`` alone when set_ratio_weight is 0,
    ``fuzz.token_set_ratio`` alone when ratio_weight is 0, and otherwise the
    weighted sum of both. On ties the earliest reference row wins.

    Parameters:
    row (pandas.Series): The row to apply fuzzy join on.
    df_reference (pandas.DataFrame): The dataframe to compare with.
    column_reference (str): The column of df_reference holding the candidate names.
    column_matched_to (str): The column of row holding the name to match.
    take_regist_number (bool): Whether to also return the 'Nomor Pendaftaran' of the best match.
    take_source (bool): Whether to also return the 'Source' of the best match.
    set_ratio_weight (float): The weight of the token-set-ratio similarity.
    ratio_weight (float): The weight of the plain ratio similarity.

    Returns:
    tuple: (matched name, score), extended with the registration number
    and/or the source, in that order, when requested. Without any match
    scoring above 0 the result is ('', 0) and the extra fields are ''.
    """
    def _score(candidate):
        query = row[column_matched_to]
        if set_ratio_weight == 0:
            return fuzz.ratio(candidate.lower(), query.lower(), processor=utils.default_process)
        token_set_score = fuzz.token_set_ratio(candidate, query, processor=utils.default_process)
        if ratio_weight == 0:
            return token_set_score
        return set_ratio_weight * token_set_score + ratio_weight * fuzz.ratio(candidate.lower(), query.lower(), processor=utils.default_process)

    similar_product_name = ''
    similarity_score = 0
    best_position = None
    for position, product_name in enumerate(df_reference[column_reference]):
        score = _score(product_name)
        # Strict comparison: the first of several equally good rows is kept.
        if score > similarity_score:
            similarity_score = score
            similar_product_name = product_name
            best_position = position

    # Look the extra fields up once, by position of the winning row, instead
    # of re-filtering the whole reference frame on every improvement. The
    # winner is always the first row carrying that name, so the result is
    # the same row the name-based lookup selected.
    nomor_pendaftaran = ''
    source = ''
    if best_position is not None:
        if take_regist_number:
            nomor_pendaftaran = df_reference['Nomor Pendaftaran'].iloc[best_position]
        if take_source:
            source = df_reference['Source'].iloc[best_position]

    if take_regist_number and take_source:
        return similar_product_name, similarity_score, nomor_pendaftaran, source
    if take_regist_number:
        return similar_product_name, similarity_score, nomor_pendaftaran
    if take_source:
        return similar_product_name, similarity_score, source
    return similar_product_name, similarity_score
94
+
95
+
96
def fuzzy_join_compare(df, first_column, second_column, registered_fertilizers, take_regist_number=True, set_ratio_weight=1, ratio_weight=0):
    """
    This function fuzzy-matches two columns of the given dataframe against the
    registered fertilizers and keeps, per row, the better of the two matches.

    Both columns are matched against the 'Nama Lengkap' column of
    registered_fertilizers with fuzzy_join. When the scores are equal the
    match of first_column is kept. The result columns are added to df in
    place and df is returned.

    Parameters:
    df (pandas.DataFrame): The dataframe to apply fuzzy join on.
    first_column (str): The first column to use for fuzzy join.
    second_column (str): The second column to use for fuzzy join.
    registered_fertilizers (pandas.DataFrame): The dataframe containing the registered fertilizers.
    take_regist_number (bool): Whether to take the nomor pendaftaran from the registered fertilizer dataset.
    set_ratio_weight (float): The weight of the token-set-ratio similarity.
    ratio_weight (float): The weight of the plain ratio similarity.

    Returns:
    pandas.DataFrame: The input dataframe with the additional columns
    'Max Similarity Score', 'Matched Product Name' and, when
    take_regist_number is True, 'Nomor Pendaftaran'.
    """
    # fuzzy_join returns 2-tuples without the registration number, so the
    # number of result fields depends on the flag.
    fields = ['name', 'score', 'registration'] if take_regist_number else ['name', 'score']

    def _match_against(source_column):
        matches = [
            fuzzy_join(row, registered_fertilizers, 'Nama Lengkap', source_column, take_regist_number=take_regist_number, set_ratio_weight=set_ratio_weight, ratio_weight=ratio_weight)
            for _, row in df.iterrows()
        ]
        return pd.DataFrame(matches, index=df.index, columns=fields)

    first = _match_against(first_column)
    second = _match_against(second_column)

    # True where the first column's match is at least as good as the second's.
    prefer_first = (first['score'] >= second['score']).to_numpy()

    # np.where yields plain arrays, so the assignments are positional and do
    # not depend on index alignment.
    df['Max Similarity Score'] = np.where(prefer_first, first['score'], second['score'])
    df['Matched Product Name'] = np.where(prefer_first, first['name'], second['name'])
    if take_regist_number:
        df['Nomor Pendaftaran'] = np.where(prefer_first, first['registration'], second['registration'])

    return df
126
+
127
def slice_with_filter(df, column, ref_df, use_filter=False, filter_condition=None):
    """
    Keep only the rows of df whose value in ``column`` does not occur in the
    same column of the reference dataframe.

    :param df: pandas.DataFrame, dataframe to be sliced
    :param column: str, column compared between both dataframes
    :param ref_df: pandas.DataFrame, reference dataframe
    :param use_filter: bool, whether to restrict the reference rows first
    :param filter_condition: indexer applied as ``ref_df[filter_condition]`` when use_filter is True

    :return: pandas.DataFrame, sliced dataframe
    """
    reference = ref_df[filter_condition] if use_filter else ref_df
    excluded_values = reference[column].to_list()
    keep_mask = ~df[column].isin(excluded_values)
    return df[keep_mask]
143
+
144
def combine_catalog(column_1, column_2, source_1, source_2):
    """
    Stack two product columns into one dataframe and tag each row with the
    source it came from.

    :param column_1: pandas.Series, first column
    :param column_2: pandas.Series, second column
    :param source_1: str, source of first column
    :param source_2: str, source of second column

    :return: pandas.DataFrame, with the columns 'Registered Product' and
             'Source' and a fresh 0..n-1 index
    """
    products = pd.concat([column_1, column_2])
    # One constant label per row, carrying the same index as the products.
    sources = pd.concat([
        column_1.apply(lambda _: source_1),
        column_2.apply(lambda _: source_2),
    ])

    combined_catalog = products.to_frame(name='Registered Product')
    combined_catalog['Source'] = sources
    return combined_catalog.reset_index(drop=True)
161
+
162
def _parse_category_cell(raw_value):
    """Turn one scraped category cell into a list of names, or NaN if unknown."""
    # Cells without the word "Category" (including NaN) carry no category.
    if not (isinstance(raw_value, str) and 'Category' in raw_value):
        return np.nan
    # Drop the "Category" header and the "Lihat Lebih Banyak" link text;
    # line breaks separate the individual category names.
    text = raw_value.replace('Category', '').replace('\n', ',').replace('Lihat Lebih Banyak', '')
    parts = text.split(',')
    if 'Unknown' in parts:
        return np.nan
    # Strip before discarding empties so whitespace-only entries are dropped too.
    return [part.strip() for part in parts if part.strip()]


def clean_category_dataframe(df, category_column, product_name_column, reference_table, reference_column, split=False):
    """
    This function parses the scraped category text of each product and
    attaches the resulting category list to the reference table.

    Products are matched on their lowercased, stripped name. When a product
    name occurs several times in df, its last occurrence is used. The input
    dataframes are not modified.

    Parameters:
    df (pandas.DataFrame): The dataframe holding the scraped categories.
    category_column (str): The column containing category name.
    product_name_column (str): The column containing product name.
    reference_table (pandas.DataFrame): The table to attach the categories to; it must contain the columns 'Product Name' and 'Product Name Clean'.
    reference_column (str): The column of reference_table holding the product name used for matching.
    split (bool): Whether to additionally return the rows that have a category list.

    Returns:
    pandas.DataFrame: The columns 'Product Name', 'Product Name Clean' and
    'category_list' (a list of category names, or NaN when unknown), one row
    per reference row with a fresh 0..n-1 index. When split is True, a tuple
    of that dataframe and its subset with a non-missing category_list.
    """
    product_keys = df[product_name_column].apply(lambda name: str(name).lower().strip())
    category_lists = df[category_column].apply(_parse_category_cell)
    # Later rows overwrite earlier ones, i.e. the last duplicate wins.
    categories_by_product = dict(zip(product_keys, category_lists))

    # Left join with product query
    reference_keys = reference_table[reference_column].str.lower().str.strip()
    df_reference = reference_table[['Product Name', 'Product Name Clean']].reset_index(drop=True)
    # An explicit object Series keeps each list as a single cell value.
    df_reference['category_list'] = pd.Series(
        [categories_by_product.get(key, np.nan) for key in reference_keys],
        index=df_reference.index,
        dtype=object,
    )

    if split:
        return df_reference, df_reference.dropna(subset=['category_list'])
    return df_reference