Spaces:
Runtime error
Runtime error
import numpy as np | |
import pandas as pd | |
from rapidfuzz import process, fuzz, utils | |
def clean_dataframe(df, column, remove_na=True, remove_non_words=True, remove_symbols=True, remove_duplicates=True): | |
""" | |
This function cleans the given dataframe by removing NaN, non-words, symbols, and duplicates. | |
Parameters: | |
df (pandas.DataFrame): The dataframe to clean. | |
column (str): The column to clean. | |
remove_na (bool): Whether to remove NaN or not. | |
remove_non_words (bool): Whether to remove non-words or not. | |
remove_symbols (bool): Whether to remove symbols or not. | |
remove_duplicates (bool): Whether to remove duplicates or not. | |
Returns: | |
pandas.DataFrame: The cleaned dataframe. | |
""" | |
# Lowercase the column | |
df[column + ' Clean'] = df[column].apply(lambda x: str(x).lower()) | |
# Remove non words (symbols, numbers, etc.) | |
if remove_non_words: | |
df[column + ' Clean'] = '' | |
for i in range(len(df)): | |
row = df.iloc[i] | |
clean_word_list = [] | |
for word in str(row[column]).lower().split(): | |
if not any(char.isdigit() for char in word): | |
clean_word_list.append(word) | |
df.at[i, column + ' Clean'] = ' '.join(clean_word_list) | |
# Remove symbols, but keep numbers | |
if remove_symbols: | |
df[column + ' Clean'] = df[column + ' Clean'].apply(lambda x: ''.join(letter for letter in x if letter.isalnum() or letter.isspace())) | |
# Drop if the new column is NaN or empty string (when the whitespace is removed, it is '') | |
if remove_na: | |
df = df[df[column + ' Clean'].notna()] | |
df = df[df[column + ' Clean'].replace(' ','') != ''] | |
# Remove duplicates | |
if remove_duplicates: | |
df = df.drop_duplicates(subset=[column + ' Clean']) | |
return df | |
def fuzzy_join(row, df_reference, column_reference, column_matched_to, take_regist_number=False, take_source = False, set_ratio_weight=0.5, ratio_weight=0.5): | |
""" | |
This function applies fuzzy join to the given row and returns the matched product name and nomor pendaftaran | |
based on the maximum similarity score between the two columns. | |
Parameters: | |
row (pandas.Series): The row to apply fuzzy join on. | |
df_reference (pandas.DataFrame): The dataframe to compare with. | |
column_reference (str): The column to use for fuzzy join. | |
column_matched_to (str): The column to compare with. | |
take_regist_number (bool): Whether to take the nomor pendaftaran from the registered fertilizer dataset. | |
set_ratio_weight (int): The weight to set for the ratio-based similarity metric. | |
ratio_weight (int): The weight to set for the weighted average of the two similarity metrics. | |
Returns: | |
pandas.DataFrame: The input dataframe with additional columns for matched product name and nomor pendaftaran. | |
""" | |
similar_product_name = '' | |
similarity_score = 0 | |
nomor_pendaftaran = '' | |
source = '' | |
for product_name in df_reference[column_reference]: | |
if set_ratio_weight == 0: | |
score = fuzz.ratio(product_name.lower(), row[column_matched_to].lower(), processor=utils.default_process) | |
elif ratio_weight == 0: | |
score = fuzz.token_set_ratio(product_name, row[column_matched_to], processor=utils.default_process) | |
else: | |
score = set_ratio_weight * fuzz.token_set_ratio(product_name, row[column_matched_to], processor=utils.default_process) + ratio_weight * fuzz.ratio(product_name.lower(), row[column_matched_to].lower(), processor=utils.default_process) | |
if score > similarity_score: | |
similarity_score = score | |
similar_product_name = product_name | |
if take_regist_number: | |
nomor_pendaftaran = df_reference[df_reference[column_reference] == product_name]['Nomor Pendaftaran'].iloc[0] | |
if take_source: | |
source = df_reference[df_reference[column_reference] == product_name]['Source'].iloc[0] | |
if take_regist_number and take_source: | |
return similar_product_name, similarity_score, nomor_pendaftaran, source | |
elif take_regist_number: | |
return similar_product_name, similarity_score, nomor_pendaftaran | |
elif take_source: | |
return similar_product_name, similarity_score, source | |
else: | |
return similar_product_name, similarity_score | |
def fuzzy_join_compare(df, first_column, second_column, registered_fertilizers, take_regist_number=True, set_ratio_weight=1, ratio_weight=0): | |
""" | |
This function applies fuzzy join to the given dataframe and returns the matched product name and nomor pendaftaran | |
based on the maximum similarity score between the two columns. | |
Parameters: | |
df (pandas.DataFrame): The dataframe to apply fuzzy join on. | |
first_column (str): The first column to use for fuzzy join. | |
second_column (str): The second column to compare with. | |
registered_fertilizers (pandas.DataFrame): The dataframe containing the registered fertilizers. | |
take_regist_number (bool): Whether to take the nomor pendaftaran from the registered fertilizer dataset. | |
set_ratio_weight (int): The weight to set for the ratio-based similarity metric. | |
ratio_weight (int): The weight to set for the weighted average of the two similarity metrics. | |
Returns: | |
pandas.DataFrame: The input dataframe with additional columns for matched product name and nomor pendaftaran. | |
""" | |
df['Matched Product Name 1'], df['Similarity Score 1'], df['Nomor Pendaftaran 1'] = zip(*df.apply(lambda row: fuzzy_join(row, registered_fertilizers, 'Nama Lengkap', first_column, take_regist_number=take_regist_number, set_ratio_weight=set_ratio_weight, ratio_weight=ratio_weight), axis=1)) | |
df['Matched Product Name 2'], df['Similarity Score 2'], df['Nomor Pendaftaran 2'] = zip(*df.apply(lambda row: fuzzy_join(row, registered_fertilizers, 'Nama Lengkap', second_column, take_regist_number=take_regist_number, set_ratio_weight=set_ratio_weight, ratio_weight=ratio_weight), axis=1)) | |
# Take the maximum similarity score and take the matched product name and nomor pendaftaran based on that | |
df['Max Similarity Score'] = df[['Similarity Score 1', 'Similarity Score 2']].max(axis=1) | |
# If condition: if similarity score 1 is higher than equal to similarity score 2, take the matched product name 1 as matched product name, else take matched product name 2 | |
df['Matched Product Name'] = np.where(df['Similarity Score 1'] >= df['Similarity Score 2'], df['Matched Product Name 1'], df['Matched Product Name 2']) | |
# If condition: if similarity score 1 is higher than equal to similarity score 2, take the nomor pendaftaran 1 as nomor pendaftaran, else take nomor pendaftaran 2 | |
df['Nomor Pendaftaran'] = np.where(df['Similarity Score 1'] >= df['Similarity Score 2'], df['Nomor Pendaftaran 1'], df['Nomor Pendaftaran 2']) | |
# Remove the columns that are no longer needed such as the matched product name 1 and 2, similarity score 1 and 2, and nomor pendaftaran 1 and 2 | |
df.drop(columns=['Matched Product Name 1', 'Matched Product Name 2', 'Similarity Score 1', 'Similarity Score 2', 'Nomor Pendaftaran 1', 'Nomor Pendaftaran 2'], inplace=True) | |
return df | |
def slice_with_filter(df, column, ref_df, use_filter=False, filter_condition=None): | |
""" | |
This function slices the given dataframe based on the given reference dataframe. | |
:param df: pandas.DataFrame, dataframe to be sliced | |
:param column: str, column to be sliced | |
:param ref_df: pandas.DataFrame, reference dataframe | |
:param use_filter: bool, whether to use filter or not | |
:param filter_condition: str, filter condition | |
:return: pandas.DataFrame, sliced dataframe | |
""" | |
if use_filter: | |
ref_df = ref_df[filter_condition] | |
return df[~df[column].isin(ref_df[column].to_list())] | |
def combine_catalog(column_1, column_2, source_1, source_2): | |
""" | |
This function combines two columns into one dataframe. | |
:param column_1: pandas.Series, first column | |
:param column_2: pandas.Series, second column | |
:param source_1: str, source of first column | |
:param source_2: str, source of second column | |
:return: pandas.DataFrame, combined dataframe | |
""" | |
combined_catalog = pd.concat([column_1, column_2]) | |
combined_catalog = combined_catalog.to_frame(name='Registered Product') | |
combined_catalog['Source'] = pd.concat([column_1.apply(lambda x: source_1), column_2.apply(lambda x: source_2)]) | |
combined_catalog.reset_index(drop=True, inplace=True) | |
return combined_catalog | |
def clean_category_dataframe(df, category_column, product_name_column, reference_table, reference_column, split=False): | |
""" | |
This function cleans the given dataframe by removing NaN, non-words, symbols, and duplicates. | |
Parameters: | |
df (pandas.DataFrame): The dataframe to clean. | |
category_column (str): The column containing category name. | |
product_name_column (str): The column containing product name. | |
reference_table (pandas.DataFrame): The reference table to be used for fuzzy join. | |
reference_column (str): The column to be used for fuzzy join. | |
split (bool): Whether to split the dataframe into two or not. | |
Returns: | |
pandas.DataFrame: The cleaned dataframe. | |
""" | |
# If column does not contain "Category", fill it with "Unknown" (including those that are NaN) | |
df[category_column] = df[category_column].apply(lambda x: x if isinstance(x, str) and 'Category' in x else 'Unknown') | |
# If column contains "Category", remove the word "Category" and replace "\n" with "," | |
df[category_column] = df[category_column].apply(lambda x: x.replace('Category', '').replace('\n', ',') if isinstance(x, str) else x) | |
# Replace "Lihat Lebih Banyak" with empty string | |
df[category_column] = df[category_column].apply(lambda x: x.replace('Lihat Lebih Banyak', '') if isinstance(x, str) else x) | |
# Add category_list column | |
df['category_list'] = df[category_column].apply(lambda x: x.split(',') if isinstance(x, str) else x) | |
# Add product_name_clean | |
df['product_name_clean'] = df[product_name_column].apply(lambda x: str(x).lower().strip()) | |
# Remove duplicates | |
df = df.drop_duplicates(subset=['product_name_clean'], keep = 'last') | |
# Left join with product query | |
df_reference = reference_table.merge(df[['product_name_clean','category_list']], how='left', left_on=reference_table[reference_column].str.lower().str.strip(), right_on=df['product_name_clean']) | |
# convert category_list that contains 'Unknown' to NaN | |
df_reference['category_list'] = df_reference['category_list'].apply(lambda x: np.nan if isinstance(x, list) and 'Unknown' in x else x) | |
# if the list in category_list contains empty string element, drop that element from the list | |
df_reference['category_list'] = df_reference['category_list'].apply(lambda x: [i for i in x if i != ''] if isinstance(x, list) else x) | |
# Choose final columns | |
df_reference = df_reference[['Product Name', 'Product Name Clean', 'category_list']] | |
# Strip | |
df_reference['category_list'] = df_reference['category_list'].apply(lambda x: [i.strip() for i in x] if isinstance(x, list) else x) | |
if split: | |
return df_reference, df_reference.dropna(subset=['category_list']) | |
else: | |
return df_reference |