File size: 8,922 Bytes
b279c69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
import os
import warnings
from rapidfuzz import fuzz, utils
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from scipy.special import softmax


def _label_category(categories):
    """Map a product's raw category list to a confidence-graded label.

    A single-element list yields a 'High' label, a longer list a 'Medium' one.
    Anything that is not a list, or mentions neither known category, is 'Others'.
    """
    if not isinstance(categories, list) or not categories:
        return 'Others'
    confidence = 'High' if len(categories) == 1 else 'Medium'
    # Fertilizer is checked first so it wins when both categories are present.
    if 'Garden Soil & Fertilizers' in categories:
        return 'Fertilizer - ' + confidence
    if 'Weeds & Pest Control' in categories:
        return 'Pesticide - ' + confidence
    return 'Others'


def generate_training_data(df, text_column, label_column, external_table=None, external_column=None, add_external_table=False, sampling=True, sample_sizes=(1250, 1250, 1500)):
    """
    This function generates training data for the model, or loads the cached
    copy from 'training/training_data.csv' when the 'training' folder is not empty.

    The input dataframes are never modified; all work is done on copies.

    :param df: pandas.DataFrame, dataframe containing product name and category name
    :param text_column: str, column name containing product name
    :param label_column: str, column name containing the list of raw categories
    :param external_table: pandas.DataFrame, extra products that are all labelled 'Fertilizer - High'
    :param external_column: str, column name containing product name in external_table
    :param add_external_table: bool, whether to add external table or not
    :param sampling: bool, whether to do sampling or not (only used with add_external_table)
    :param sample_sizes: sequence of int, rows to draw for labels 0, 1 and 2;
        a class with fewer rows than requested contributes all of its rows

    :return: pandas.DataFrame. With add_external_table the columns are
        'product_name' and an integer 'category_name' (0 = fertilizer,
        1 = pesticide, 2 = others); otherwise the columns are text_column and
        a string 'category_name'. The cached CSV is returned as stored.
    :raises ValueError: if add_external_table is set without external_table/external_column
    """
    # A missing folder is treated like an empty one instead of crashing.
    try:
        cached_files = os.listdir('training')
    except FileNotFoundError:
        cached_files = []
    if cached_files:
        return pd.read_csv('training/training_data.csv')

    print('Training folder is empty. Generating training data...')
    with open('config.yaml') as config_file:
        units = yaml.load(config_file, Loader=yaml.FullLoader)['excluded_words']

    # Build a fresh frame so the caller's dataframe is left untouched.
    labelled = df[[text_column]].copy()
    labelled['category_name'] = df[label_column].apply(_label_category)

    # take only where category_name is Fertilizer - High or Pesticide - High or Others
    labelled = labelled[labelled['category_name'].isin(['Fertilizer - High', 'Pesticide - High', 'Others'])]

    # exclude product name that contains units AND category_name is Others.
    # Skipped when no units are configured: an empty pattern matches every name.
    if units:
        unit_pattern = '|'.join(str(unit) for unit in units)
        mentions_unit = labelled[text_column].str.contains(unit_pattern, na=False)
        labelled = labelled[~(mentions_unit & (labelled['category_name'] == 'Others'))]

    if not add_external_table:
        return labelled

    if external_table is None or external_column is None:
        raise ValueError('external_table and external_column are required when add_external_table is True')

    extra = external_table[[external_column]].copy()
    extra.columns = [text_column]
    extra['category_name'] = 'Fertilizer - High'

    training_df = pd.concat([extra, labelled])
    training_df.columns = ['product_name', 'category_name']

    label_ids = {'Fertilizer - High': 0, 'Pesticide - High': 1}
    training_df['category_name'] = training_df['category_name'].map(lambda name: label_ids.get(name, 2))

    if not sampling:
        return training_df

    samples = []
    for label_id, quota in enumerate(sample_sizes):
        group = training_df[training_df['category_name'] == label_id]
        take = min(quota, len(group))
        if take < quota:
            print('Only', take, 'rows available for label', label_id, '- requested', quota)
        samples.append(group.sample(n=take))
    return pd.concat(samples)

def category_reassign(row, reference_df, checked_category, threshold=70):
    """
    Re-label a product whose category equals checked_category by borrowing the
    category of the first sufficiently similar product in the reference dataframe.

    Rows with any other category are returned unchanged. Reference rows that
    already carry checked_category are skipped. Similarity is fuzz.ratio
    computed with rapidfuzz's default string preprocessing.

    :param row: pandas.Series, row with 'product_name' and 'category_name'
    :param reference_df: pandas.DataFrame, dataframe with 'product_name' and 'category_name'
    :param checked_category: category value to be checked
    :param threshold: int, minimum similarity score that triggers a reassignment

    :return: category of the first matching reference row, otherwise the row's own category
    """
    own_category = row['category_name']
    if own_category != checked_category:
        return own_category

    for _, candidate in reference_df.iterrows():
        candidate_category = candidate['category_name']
        if candidate_category != checked_category and fuzz.ratio(
            row['product_name'],
            candidate['product_name'],
            processor=utils.default_process,
        ) >= threshold:
            return candidate_category

    # Nothing in the reference was similar enough: keep the checked category.
    return checked_category
    
def train_model(df, stratify=True, model_type='bert', use_existing_model=False, model_name=None):
    """
    This function trains the model using the configuration in config.yaml

    The dataframe is split into train/test parts, a ClassificationModel is
    created (CPU only), trained on the train part and evaluated on the test part.

    :param df: pandas.DataFrame, dataframe containing product name and category name
    :param stratify: bool, whether to stratify the train/test split on 'category_name'
    :param model_type: str, type of model to use (key into parameters.model_types in config.yaml)
    :param use_existing_model: bool, whether to use existing model or not
    :param model_name: str, name or path of existing model (required when use_existing_model is True)

    :return: simpletransformers.classification.ClassificationModel, model
    :return: numpy.ndarray, predictions
    :return: str, classification report
    :return: pandas.DataFrame, training dataframe
    :return: pandas.DataFrame, testing dataframe
    :return: list, list of class names
    :raises ValueError: if use_existing_model is True but model_name is not given
    """
    warnings.filterwarnings('ignore')

    if use_existing_model and model_name is None:
        raise ValueError('model_name is required when use_existing_model is True')

    # Parse the configuration once and close the file handle afterwards.
    with open('config.yaml') as config_file:
        parameters = yaml.load(config_file, Loader=yaml.FullLoader)['parameters']

    test_size = parameters['training_args']['test_size']
    # Honour the stratify flag (it used to be ignored and the split was always stratified).
    train_df, test_df = train_test_split(
        df,
        test_size=test_size,
        stratify=df['category_name'] if stratify else None,
    )

    # Optional model configuration
    model_config = parameters['model_args']
    model_args = ClassificationArgs()
    model_args.num_train_epochs = model_config['num_train_epochs']
    model_args.train_batch_size = model_config['train_batch_size']
    model_args.eval_batch_size = model_config['eval_batch_size']
    model_args.overwrite_output_dir = model_config['overwrite_output_dir']
    model_args.fp16 = model_config['fp16']
    model_args.do_lower_case = model_config['do_lower_case']

    # Create a ClassificationModel
    model_detail = parameters['model_types']
    class_names = parameters['class_names']

    pretrained_source = model_name if use_existing_model else model_detail[model_type]
    model = ClassificationModel(model_type, pretrained_source, num_labels=len(class_names), args=model_args, use_cuda=False)

    # Train the model
    model.train_model(train_df)

    # Evaluate the model; the predicted class is the index of the largest output
    result, model_outputs, wrong_predictions = model.eval_model(test_df)
    preds = np.argmax(model_outputs, axis=1)
    class_report = classification_report(test_df['category_name'], preds, target_names=class_names)

    return model, preds, class_report, train_df, test_df, class_names

def save_model(model, model_name):
    """
    Persist the model weights, tokenizer and config via their save_pretrained methods.

    :param model: simpletransformers.classification.ClassificationModel, model
    :param model_name: str, name of model (used as the output directory)

    :return: None
    """
    # Weights and tokenizer are written to the directory named by model_name.
    for component in ('model', 'tokenizer'):
        getattr(model, component).save_pretrained(model_name)

    # The config is written to the same name with a trailing slash.
    output_dir = model_name + '/'
    model.config.save_pretrained(output_dir)
    print('Model saved to ' + output_dir)

def show_confusion_matrix(test_category, preds, class_names):
    """
    This function draws the confusion matrix as an annotated heatmap.

    :param test_category: numpy.ndarray, array of true categories
    :param preds: numpy.ndarray, array of predictions
    :param class_names: list, list of class names, used as row and column labels;
        must have one entry per class found in test_category/preds

    :return: matplotlib Axes holding the heatmap (previously documented but never returned)
    """
    cm = confusion_matrix(test_category, preds)
    df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
    hmap = sns.heatmap(df_cm, annot=True, fmt="d", cmap="Blues")
    # Keep the row labels horizontal and tilt the column labels.
    hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation=0, ha='right')
    hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation=30, ha='right')
    plt.ylabel('True Topics')
    plt.xlabel('Predicted Topics')
    return hmap

def predict_proba(model,text):
    """
    This function predicts the probability of each class, prints it (in a text form)
    and returns it.

    The printed labels assume a three-class model ordered Fertilizer, Pesticide, Others.

    :param model: simpletransformers.classification.ClassificationModel, model
    :param text: str, text to predict

    :return: numpy.ndarray, array of probabilities (previously documented but never returned)
    """
    # Element [1] of predict()'s result is turned into probabilities with softmax;
    # presumably these are the raw model outputs. [0] selects the single input text.
    proba = softmax(model.predict([text])[1])[0]
    print('-----------------------------')
    print('Text to Predict: ', text)
    print('Probability of each class:')
    print('Fertilizer: ', proba[0])
    print('Pesticide: ', proba[1])
    print('Others: ', proba[2])
    return proba

def predict_proba_array(model, text):
    """
    Return the class probabilities for a single text without printing anything.

    :param model: simpletransformers.classification.ClassificationModel, model
    :param text: str, text to predict

    :return: numpy.ndarray, array of probabilities
    """
    # Element [1] of predict()'s result is what gets normalised; presumably the raw outputs.
    raw_outputs = model.predict([text])[1]
    class_probabilities = softmax(raw_outputs)
    # Only one text was submitted, so return its row.
    return class_probabilities[0]