import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import yaml import os import warnings from rapidfuzz import fuzz, utils from simpletransformers.classification import ClassificationModel, ClassificationArgs from sklearn.model_selection import train_test_split from sklearn.metrics import confusion_matrix, classification_report from scipy.special import softmax def generate_training_data(df, text_column, label_column, external_table = None, external_column = None, add_external_table=False, sampling=True): """ This function generates training data for the model. :param df: pandas.DataFrame, dataframe containing product name and category name :param text_column: str, column name containing product name :param label_column: str, column name containing category name :param external_table: pandas.DataFrame, dataframe containing product name and category name :param external_column: str, column name containing product name :param add_external_table: bool, whether to add external table or not :param sampling: bool, whether to do sampling or not :return: pandas.DataFrame, dataframe containing product name and category name """ if os.listdir('training') == []: print('Training folder is empty. Generating training data...') units = yaml.load(open('config.yaml'), Loader=yaml.FullLoader)['excluded_words'] df['category_name'] = df[label_column].apply(lambda x: 'Fertilizer - High' if isinstance(x, list) and len(x) == 1 and 'Garden Soil & Fertilizers' in x else 'Pesticide - High' if isinstance(x, list) and len(x) == 1 and 'Weeds & Pest Control' in x else 'Fertilizer - Medium' if isinstance(x, list) and len(x) > 1 and 'Garden Soil & Fertilizers' in x else 'Pesticide - Medium' if isinstance(x, list) and len(x) > 1 and 'Weeds & Pest Control' in x else 'Others') df = df[[text_column, 'category_name']] # take only where category_name is Ferilizer - High or Pesticide - High or Others df = df[df['category_name'].isin(['Fertilizer - High', 'Pesticide - High', 'Others'])] # exclude product name that contains units AND category_name is Others df = df[~(df[text_column].str.contains('|'.join(units)) & (df['category_name'] == 'Others'))] if add_external_table: external_table['category_name'] = 'Fertilizer - High' external_table = external_table[[external_column, 'category_name']] external_table.columns = [text_column, 'category_name'] training_df = pd.concat([external_table, df]) training_df.columns = ['product_name','category_name'] training_df['category_name'] = training_df['category_name'].apply(lambda x: 0 if x == 'Fertilizer - High' else 1 if x == 'Pesticide - High' else 2) if sampling: return pd.concat([training_df[training_df['category_name'] == 0].sample(n=1250), training_df[training_df['category_name'] == 1].sample(n=1250), training_df[training_df['category_name'] == 2].sample(n=1500)]) else: return training_df else: return df else: training_df = pd.read_csv('training/training_data.csv') return training_df def category_reassign(row, reference_df, checked_category, threshold=70): """ This function reassigns the category name of a product based on the similarity score between the product name and the reference dataframe. :param row: pandas.Series, row of dataframe :param reference_df: pandas.DataFrame, dataframe containing product name and category name :param checked_category: str, category name to be checked :param threshold: int, threshold for similarity score :return: str, category name """ if row['category_name'] == checked_category: for i in range(len(reference_df)): row2 = reference_df.iloc[i] if row2['category_name'] != checked_category: if fuzz.ratio(row['product_name'], row2['product_name'], processor= utils.default_process) >= threshold: return row2['category_name'] return checked_category else: return row['category_name'] def train_model(df, train_type, label_column, stratify=True, model_type='bert', use_existing_model=False, model_name=None): """ This function trains the model using the configuration in config.yaml :param df: pandas.DataFrame, dataframe containing product name and category name :param stratify: bool, whether to do stratified sampling or not :param model_type: str, type of model to use :param use_existing_model: bool, whether to use existing model or not :param model_name: str, name of existing model :return: simpletransformers.classification.ClassificationModel, model :return: numpy.ndarray, predictions :return: str, classification report :return: pandas.DataFrame, training dataframe :return: pandas.DataFrame, testing dataframe :return: list, list of class names """ warnings.filterwarnings('ignore') test_size = yaml.load(open('config.yaml'), Loader=yaml.FullLoader)['parameters']['training_args']['test_size'] train_df, test_df = train_test_split(df, test_size=test_size, stratify=df[label_column]) # Optional model configuration model_config = yaml.load(open('config.yaml'), Loader=yaml.FullLoader)['parameters']['model_args'] model_args = ClassificationArgs() model_args.num_train_epochs = model_config['num_train_epochs'] model_args.train_batch_size = model_config['train_batch_size'] model_args.eval_batch_size = model_config['eval_batch_size'] model_args.overwrite_output_dir = model_config['overwrite_output_dir'] model_args.fp16 = model_config['fp16'] model_args.do_lower_case = model_config['do_lower_case'] # Create a ClassificationModel model_detail = yaml.load(open('config.yaml'), Loader=yaml.FullLoader)['parameters']['model_types'] class_names = yaml.load(open('config.yaml'), Loader=yaml.FullLoader)['parameters']['class_names'][train_type] if use_existing_model: model = ClassificationModel(model_type, model_name, num_labels=len(class_names), args=model_args, use_cuda=False) else: model = ClassificationModel(model_type, model_detail[model_type], num_labels=len(class_names), args=model_args, use_cuda=False) # Train the model model.train_model(train_df) # Evaluate the model result, model_outputs, wrong_predictions = model.eval_model(test_df) preds = np.argmax(model_outputs, axis=1) class_report =classification_report(test_df[label_column], preds, target_names=class_names) return model, preds, class_report, train_df, test_df, class_names def save_model(model, model_name): """ This function saves the model. :param model: simpletransformers.classification.ClassificationModel, model :param model_name: str, name of model :return: None """ model.model.save_pretrained(model_name) model.tokenizer.save_pretrained(model_name) model.config.save_pretrained(model_name + '/') print('Model saved to ' + model_name + '/') def show_confusion_matrix(test_category, preds, class_names): """ This function shows the confusion matrix. :param test_category: numpy.ndarray, array of category name :param preds: numpy.ndarray, array of predictions :param class_names: list, list of class names :return: matplotlib.axes._subplots.AxesSubplot, confusion matrix """ cm = confusion_matrix(test_category, preds) df_cm = pd.DataFrame(cm, index=class_names, columns=class_names) hmap = sns.heatmap(df_cm, annot=True, fmt="d", cmap="Blues") hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation=0, ha='right') hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation=30, ha='right') plt.ylabel('True Topics') plt.xlabel('Predicted Topics') def predict_proba(model,text): """ This function predicts the probability of each class (in a text form). :param model: simpletransformers.classification.ClassificationModel, model :param text: str, text to predict :return: numpy.ndarray, array of probabilities """ proba = softmax(model.predict([text])[1])[0] print('-----------------------------') print('Text to Predict: ', text) print('Probability of each class:') print('Fertilizer: ', proba[0]) print('Pesticide: ', proba[1]) print('Others: ', proba[2]) def predict_proba_array(model,text): """ This function predicts the probability of each class (in an array form). :param model: simpletransformers.classification.ClassificationModel, model :param text: str, text to predict :return: numpy.ndarray, array of probabilities """ proba = softmax(model.predict([text])[1])[0] return proba