Spaces:
Runtime error
Runtime error
File size: 11,350 Bytes
b279c69 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 |
import numpy as np
import pandas as pd
from rapidfuzz import process, fuzz, utils
def clean_dataframe(df, column, remove_na=True, remove_non_words=True, remove_symbols=True, remove_duplicates=True):
"""
This function cleans the given dataframe by removing NaN, non-words, symbols, and duplicates.
Parameters:
df (pandas.DataFrame): The dataframe to clean.
column (str): The column to clean.
remove_na (bool): Whether to remove NaN or not.
remove_non_words (bool): Whether to remove non-words or not.
remove_symbols (bool): Whether to remove symbols or not.
remove_duplicates (bool): Whether to remove duplicates or not.
Returns:
pandas.DataFrame: The cleaned dataframe.
"""
# Lowercase the column
df[column + ' Clean'] = df[column].apply(lambda x: str(x).lower())
# Remove non words (symbols, numbers, etc.)
if remove_non_words:
df[column + ' Clean'] = ''
for i in range(len(df)):
row = df.iloc[i]
clean_word_list = []
for word in str(row[column]).lower().split():
if not any(char.isdigit() for char in word):
clean_word_list.append(word)
df.at[i, column + ' Clean'] = ' '.join(clean_word_list)
# Remove symbols, but keep numbers
if remove_symbols:
df[column + ' Clean'] = df[column + ' Clean'].apply(lambda x: ''.join(letter for letter in x if letter.isalnum() or letter.isspace()))
# Drop if the new column is NaN or empty string (when the whitespace is removed, it is '')
if remove_na:
df = df[df[column + ' Clean'].notna()]
df = df[df[column + ' Clean'].replace(' ','') != '']
# Remove duplicates
if remove_duplicates:
df = df.drop_duplicates(subset=[column + ' Clean'])
return df
def fuzzy_join(row, df_reference, column_reference, column_matched_to, take_regist_number=False, take_source = False, set_ratio_weight=0.5, ratio_weight=0.5):
"""
This function applies fuzzy join to the given row and returns the matched product name and nomor pendaftaran
based on the maximum similarity score between the two columns.
Parameters:
row (pandas.Series): The row to apply fuzzy join on.
df_reference (pandas.DataFrame): The dataframe to compare with.
column_reference (str): The column to use for fuzzy join.
column_matched_to (str): The column to compare with.
take_regist_number (bool): Whether to take the nomor pendaftaran from the registered fertilizer dataset.
set_ratio_weight (int): The weight to set for the ratio-based similarity metric.
ratio_weight (int): The weight to set for the weighted average of the two similarity metrics.
Returns:
pandas.DataFrame: The input dataframe with additional columns for matched product name and nomor pendaftaran.
"""
similar_product_name = ''
similarity_score = 0
nomor_pendaftaran = ''
source = ''
for product_name in df_reference[column_reference]:
if set_ratio_weight == 0:
score = fuzz.ratio(product_name.lower(), row[column_matched_to].lower(), processor=utils.default_process)
elif ratio_weight == 0:
score = fuzz.token_set_ratio(product_name, row[column_matched_to], processor=utils.default_process)
else:
score = set_ratio_weight * fuzz.token_set_ratio(product_name, row[column_matched_to], processor=utils.default_process) + ratio_weight * fuzz.ratio(product_name.lower(), row[column_matched_to].lower(), processor=utils.default_process)
if score > similarity_score:
similarity_score = score
similar_product_name = product_name
if take_regist_number:
nomor_pendaftaran = df_reference[df_reference[column_reference] == product_name]['Nomor Pendaftaran'].iloc[0]
if take_source:
source = df_reference[df_reference[column_reference] == product_name]['Source'].iloc[0]
if take_regist_number and take_source:
return similar_product_name, similarity_score, nomor_pendaftaran, source
elif take_regist_number:
return similar_product_name, similarity_score, nomor_pendaftaran
elif take_source:
return similar_product_name, similarity_score, source
else:
return similar_product_name, similarity_score
def fuzzy_join_compare(df, first_column, second_column, registered_fertilizers, take_regist_number=True, set_ratio_weight=1, ratio_weight=0):
"""
This function applies fuzzy join to the given dataframe and returns the matched product name and nomor pendaftaran
based on the maximum similarity score between the two columns.
Parameters:
df (pandas.DataFrame): The dataframe to apply fuzzy join on.
first_column (str): The first column to use for fuzzy join.
second_column (str): The second column to compare with.
registered_fertilizers (pandas.DataFrame): The dataframe containing the registered fertilizers.
take_regist_number (bool): Whether to take the nomor pendaftaran from the registered fertilizer dataset.
set_ratio_weight (int): The weight to set for the ratio-based similarity metric.
ratio_weight (int): The weight to set for the weighted average of the two similarity metrics.
Returns:
pandas.DataFrame: The input dataframe with additional columns for matched product name and nomor pendaftaran.
"""
df['Matched Product Name 1'], df['Similarity Score 1'], df['Nomor Pendaftaran 1'] = zip(*df.apply(lambda row: fuzzy_join(row, registered_fertilizers, 'Nama Lengkap', first_column, take_regist_number=take_regist_number, set_ratio_weight=set_ratio_weight, ratio_weight=ratio_weight), axis=1))
df['Matched Product Name 2'], df['Similarity Score 2'], df['Nomor Pendaftaran 2'] = zip(*df.apply(lambda row: fuzzy_join(row, registered_fertilizers, 'Nama Lengkap', second_column, take_regist_number=take_regist_number, set_ratio_weight=set_ratio_weight, ratio_weight=ratio_weight), axis=1))
# Take the maximum similarity score and take the matched product name and nomor pendaftaran based on that
df['Max Similarity Score'] = df[['Similarity Score 1', 'Similarity Score 2']].max(axis=1)
# If condition: if similarity score 1 is higher than equal to similarity score 2, take the matched product name 1 as matched product name, else take matched product name 2
df['Matched Product Name'] = np.where(df['Similarity Score 1'] >= df['Similarity Score 2'], df['Matched Product Name 1'], df['Matched Product Name 2'])
# If condition: if similarity score 1 is higher than equal to similarity score 2, take the nomor pendaftaran 1 as nomor pendaftaran, else take nomor pendaftaran 2
df['Nomor Pendaftaran'] = np.where(df['Similarity Score 1'] >= df['Similarity Score 2'], df['Nomor Pendaftaran 1'], df['Nomor Pendaftaran 2'])
# Remove the columns that are no longer needed such as the matched product name 1 and 2, similarity score 1 and 2, and nomor pendaftaran 1 and 2
df.drop(columns=['Matched Product Name 1', 'Matched Product Name 2', 'Similarity Score 1', 'Similarity Score 2', 'Nomor Pendaftaran 1', 'Nomor Pendaftaran 2'], inplace=True)
return df
def slice_with_filter(df, column, ref_df, use_filter=False, filter_condition=None):
"""
This function slices the given dataframe based on the given reference dataframe.
:param df: pandas.DataFrame, dataframe to be sliced
:param column: str, column to be sliced
:param ref_df: pandas.DataFrame, reference dataframe
:param use_filter: bool, whether to use filter or not
:param filter_condition: str, filter condition
:return: pandas.DataFrame, sliced dataframe
"""
if use_filter:
ref_df = ref_df[filter_condition]
return df[~df[column].isin(ref_df[column].to_list())]
def combine_catalog(column_1, column_2, source_1, source_2):
"""
This function combines two columns into one dataframe.
:param column_1: pandas.Series, first column
:param column_2: pandas.Series, second column
:param source_1: str, source of first column
:param source_2: str, source of second column
:return: pandas.DataFrame, combined dataframe
"""
combined_catalog = pd.concat([column_1, column_2])
combined_catalog = combined_catalog.to_frame(name='Registered Product')
combined_catalog['Source'] = pd.concat([column_1.apply(lambda x: source_1), column_2.apply(lambda x: source_2)])
combined_catalog.reset_index(drop=True, inplace=True)
return combined_catalog
def clean_category_dataframe(df, category_column, product_name_column, reference_table, reference_column, split=False):
"""
This function cleans the given dataframe by removing NaN, non-words, symbols, and duplicates.
Parameters:
df (pandas.DataFrame): The dataframe to clean.
category_column (str): The column containing category name.
product_name_column (str): The column containing product name.
reference_table (pandas.DataFrame): The reference table to be used for fuzzy join.
reference_column (str): The column to be used for fuzzy join.
split (bool): Whether to split the dataframe into two or not.
Returns:
pandas.DataFrame: The cleaned dataframe.
"""
# If column does not contain "Category", fill it with "Unknown" (including those that are NaN)
df[category_column] = df[category_column].apply(lambda x: x if isinstance(x, str) and 'Category' in x else 'Unknown')
# If column contains "Category", remove the word "Category" and replace "\n" with ","
df[category_column] = df[category_column].apply(lambda x: x.replace('Category', '').replace('\n', ',') if isinstance(x, str) else x)
# Replace "Lihat Lebih Banyak" with empty string
df[category_column] = df[category_column].apply(lambda x: x.replace('Lihat Lebih Banyak', '') if isinstance(x, str) else x)
# Add category_list column
df['category_list'] = df[category_column].apply(lambda x: x.split(',') if isinstance(x, str) else x)
# Add product_name_clean
df['product_name_clean'] = df[product_name_column].apply(lambda x: str(x).lower().strip())
# Remove duplicates
df = df.drop_duplicates(subset=['product_name_clean'], keep = 'last')
# Left join with product query
df_reference = reference_table.merge(df[['product_name_clean','category_list']], how='left', left_on=reference_table[reference_column].str.lower().str.strip(), right_on=df['product_name_clean'])
# convert category_list that contains 'Unknown' to NaN
df_reference['category_list'] = df_reference['category_list'].apply(lambda x: np.nan if isinstance(x, list) and 'Unknown' in x else x)
# if the list in category_list contains empty string element, drop that element from the list
df_reference['category_list'] = df_reference['category_list'].apply(lambda x: [i for i in x if i != ''] if isinstance(x, list) else x)
# Choose final columns
df_reference = df_reference[['Product Name', 'Product Name Clean', 'category_list']]
# Strip
df_reference['category_list'] = df_reference['category_list'].apply(lambda x: [i.strip() for i in x] if isinstance(x, list) else x)
if split:
return df_reference, df_reference.dropna(subset=['category_list'])
else:
return df_reference |