File size: 11,350 Bytes
b279c69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
import numpy as np
import pandas as pd
from rapidfuzz import process, fuzz, utils

def clean_dataframe(df, column, remove_na=True, remove_non_words=True, remove_symbols=True, remove_duplicates=True):
    """
    This function cleans the given dataframe by removing NaN, non-words, symbols, and duplicates.

    Parameters:
    df (pandas.DataFrame): The dataframe to clean.
    column (str): The column to clean.
    remove_na (bool): Whether to remove NaN or not.
    remove_non_words (bool): Whether to remove non-words or not.
    remove_symbols (bool): Whether to remove symbols or not.
    remove_duplicates (bool): Whether to remove duplicates or not.

    Returns:
    pandas.DataFrame: The cleaned dataframe.
    """
    # Lowercase the column
    df[column + ' Clean'] = df[column].apply(lambda x: str(x).lower())
    
    # Remove non words (symbols, numbers, etc.)
    if remove_non_words:
        df[column + ' Clean'] = ''
        for i in range(len(df)):
            row = df.iloc[i]
            clean_word_list = []
            for word in str(row[column]).lower().split():
                if not any(char.isdigit() for char in word):
                    clean_word_list.append(word)
            df.at[i, column + ' Clean'] = ' '.join(clean_word_list)
    
    # Remove symbols, but keep numbers
    if remove_symbols:
        df[column + ' Clean'] = df[column + ' Clean'].apply(lambda x: ''.join(letter for letter in x if letter.isalnum() or letter.isspace()))
    
    # Drop if the new column is NaN or empty string (when the whitespace is removed, it is '')
    if remove_na:
        df = df[df[column + ' Clean'].notna()]
        df = df[df[column + ' Clean'].replace(' ','') != '']
    
    # Remove duplicates
    if remove_duplicates:
        df = df.drop_duplicates(subset=[column + ' Clean'])

    return df

def fuzzy_join(row, df_reference, column_reference, column_matched_to, take_regist_number=False, take_source = False, set_ratio_weight=0.5, ratio_weight=0.5):
    """
    This function applies fuzzy join to the given row and returns the matched product name and nomor pendaftaran
    based on the maximum similarity score between the two columns.

    Parameters:
    row (pandas.Series): The row to apply fuzzy join on.
    df_reference (pandas.DataFrame): The dataframe to compare with.
    column_reference (str): The column to use for fuzzy join.
    column_matched_to (str): The column to compare with.
    take_regist_number (bool): Whether to take the nomor pendaftaran from the registered fertilizer dataset.
    set_ratio_weight (int): The weight to set for the ratio-based similarity metric.
    ratio_weight (int): The weight to set for the weighted average of the two similarity metrics.

    Returns:
    pandas.DataFrame: The input dataframe with additional columns for matched product name and nomor pendaftaran.
    """
    similar_product_name = ''
    similarity_score = 0
    nomor_pendaftaran = ''
    source = ''
    for product_name in df_reference[column_reference]:
        if set_ratio_weight == 0:
            score = fuzz.ratio(product_name.lower(), row[column_matched_to].lower(), processor=utils.default_process)
        elif ratio_weight == 0:
            score = fuzz.token_set_ratio(product_name, row[column_matched_to], processor=utils.default_process)
        else:
            score = set_ratio_weight * fuzz.token_set_ratio(product_name, row[column_matched_to], processor=utils.default_process) + ratio_weight * fuzz.ratio(product_name.lower(), row[column_matched_to].lower(), processor=utils.default_process)
            
        if score > similarity_score:
            similarity_score = score
            similar_product_name = product_name
            if take_regist_number:
                nomor_pendaftaran = df_reference[df_reference[column_reference] == product_name]['Nomor Pendaftaran'].iloc[0]
            if take_source:
                source = df_reference[df_reference[column_reference] == product_name]['Source'].iloc[0]

    if take_regist_number and take_source:
        return similar_product_name, similarity_score, nomor_pendaftaran, source
    elif take_regist_number:
        return similar_product_name, similarity_score, nomor_pendaftaran
    elif take_source:
        return similar_product_name, similarity_score, source
    else:
        return similar_product_name, similarity_score
    

def fuzzy_join_compare(df, first_column, second_column, registered_fertilizers, take_regist_number=True, set_ratio_weight=1, ratio_weight=0):
    """
    This function applies fuzzy join to the given dataframe and returns the matched product name and nomor pendaftaran
    based on the maximum similarity score between the two columns.

    Parameters:
    df (pandas.DataFrame): The dataframe to apply fuzzy join on.
    first_column (str): The first column to use for fuzzy join.
    second_column (str): The second column to compare with.
    registered_fertilizers (pandas.DataFrame): The dataframe containing the registered fertilizers.
    take_regist_number (bool): Whether to take the nomor pendaftaran from the registered fertilizer dataset.
    set_ratio_weight (int): The weight to set for the ratio-based similarity metric.
    ratio_weight (int): The weight to set for the weighted average of the two similarity metrics.

    Returns:
    pandas.DataFrame: The input dataframe with additional columns for matched product name and nomor pendaftaran.
    """
    df['Matched Product Name 1'], df['Similarity Score 1'], df['Nomor Pendaftaran 1'] = zip(*df.apply(lambda row: fuzzy_join(row, registered_fertilizers, 'Nama Lengkap', first_column, take_regist_number=take_regist_number, set_ratio_weight=set_ratio_weight, ratio_weight=ratio_weight), axis=1))
    df['Matched Product Name 2'], df['Similarity Score 2'], df['Nomor Pendaftaran 2'] = zip(*df.apply(lambda row: fuzzy_join(row, registered_fertilizers, 'Nama Lengkap', second_column, take_regist_number=take_regist_number, set_ratio_weight=set_ratio_weight, ratio_weight=ratio_weight), axis=1))

    # Take the maximum similarity score and take the matched product name and nomor pendaftaran based on that
    df['Max Similarity Score'] = df[['Similarity Score 1', 'Similarity Score 2']].max(axis=1)
    # If condition: if similarity score 1 is higher than equal to similarity score 2, take the matched product name 1 as matched product name, else take matched product name 2
    df['Matched Product Name'] = np.where(df['Similarity Score 1'] >= df['Similarity Score 2'], df['Matched Product Name 1'], df['Matched Product Name 2'])
    # If condition: if similarity score 1 is higher than equal to similarity score 2, take the nomor pendaftaran 1 as nomor pendaftaran, else take nomor pendaftaran 2
    df['Nomor Pendaftaran'] = np.where(df['Similarity Score 1'] >= df['Similarity Score 2'], df['Nomor Pendaftaran 1'], df['Nomor Pendaftaran 2'])
    # Remove the columns that are no longer needed such as the matched product name 1 and 2, similarity score 1 and 2, and nomor pendaftaran 1 and 2
    df.drop(columns=['Matched Product Name 1', 'Matched Product Name 2', 'Similarity Score 1', 'Similarity Score 2', 'Nomor Pendaftaran 1', 'Nomor Pendaftaran 2'], inplace=True)
    
    return df

def slice_with_filter(df, column, ref_df, use_filter=False, filter_condition=None):
    """
    This function slices the given dataframe based on the given reference dataframe.

    :param df: pandas.DataFrame, dataframe to be sliced
    :param column: str, column to be sliced
    :param ref_df: pandas.DataFrame, reference dataframe
    :param use_filter: bool, whether to use filter or not
    :param filter_condition: str, filter condition

    :return: pandas.DataFrame, sliced dataframe
    """
    if use_filter:
        ref_df = ref_df[filter_condition]

    return df[~df[column].isin(ref_df[column].to_list())]

def combine_catalog(column_1, column_2, source_1, source_2):
    """
    This function combines two columns into one dataframe.

    :param column_1: pandas.Series, first column
    :param column_2: pandas.Series, second column
    :param source_1: str, source of first column
    :param source_2: str, source of second column

    :return: pandas.DataFrame, combined dataframe
    """
    combined_catalog = pd.concat([column_1, column_2])
    combined_catalog = combined_catalog.to_frame(name='Registered Product')
    combined_catalog['Source'] = pd.concat([column_1.apply(lambda x: source_1), column_2.apply(lambda x: source_2)])
    combined_catalog.reset_index(drop=True, inplace=True)

    return combined_catalog

def clean_category_dataframe(df, category_column, product_name_column, reference_table, reference_column, split=False):
    """
    This function cleans the given dataframe by removing NaN, non-words, symbols, and duplicates.

    Parameters:
    df (pandas.DataFrame): The dataframe to clean.
    category_column (str): The column containing category name.
    product_name_column (str): The column containing product name.
    reference_table (pandas.DataFrame): The reference table to be used for fuzzy join.
    reference_column (str): The column to be used for fuzzy join.
    split (bool): Whether to split the dataframe into two or not.

    Returns:
    pandas.DataFrame: The cleaned dataframe.
    """
    # If column does not contain "Category", fill it with "Unknown" (including those that are NaN)
    df[category_column] = df[category_column].apply(lambda x: x if isinstance(x, str) and 'Category' in x else 'Unknown')
    # If column contains "Category", remove the word "Category" and replace "\n" with ","
    df[category_column] = df[category_column].apply(lambda x: x.replace('Category', '').replace('\n', ',') if isinstance(x, str) else x)
    # Replace "Lihat Lebih Banyak" with empty string
    df[category_column] = df[category_column].apply(lambda x: x.replace('Lihat Lebih Banyak', '') if isinstance(x, str) else x)
    # Add category_list column
    df['category_list'] = df[category_column].apply(lambda x: x.split(',') if isinstance(x, str) else x)
    # Add product_name_clean
    df['product_name_clean'] = df[product_name_column].apply(lambda x: str(x).lower().strip())
    # Remove duplicates
    df = df.drop_duplicates(subset=['product_name_clean'], keep = 'last')
    # Left join with product query
    df_reference = reference_table.merge(df[['product_name_clean','category_list']], how='left', left_on=reference_table[reference_column].str.lower().str.strip(), right_on=df['product_name_clean'])
    # convert category_list that contains 'Unknown' to NaN
    df_reference['category_list'] = df_reference['category_list'].apply(lambda x: np.nan if isinstance(x, list) and 'Unknown' in x else x)
    # if the list in category_list contains empty string element, drop that element from the list
    df_reference['category_list'] = df_reference['category_list'].apply(lambda x: [i for i in x if i != ''] if isinstance(x, list) else x)
    # Choose final columns
    df_reference = df_reference[['Product Name', 'Product Name Clean', 'category_list']]
    # Strip
    df_reference['category_list'] = df_reference['category_list'].apply(lambda x: [i.strip() for i in x] if isinstance(x, list) else x)

    if split:
        return df_reference, df_reference.dropna(subset=['category_list'])
    else:
        return df_reference