import numpy as np # Enables working with arrays and mathematical operations
import pandas as pd # Provides data structures and data analysis tools
import seaborn as sns; sns.set() # Enhances the visual appearance of plots and figures
from nltk.corpus import stopwords # Provides stopwords for natural language processing tasks
from nltk.stem import PorterStemmer # Implements the Porter stemming algorithm for word normalization
from nltk.tokenize import word_tokenize # Splits text into words or tokens
from sklearn.feature_extraction.text import TfidfVectorizer # Transforms text data into numerical features
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, classification_report # Evaluates model performance metrics
from sklearn.naive_bayes import MultinomialNB # Implements the Multinomial Naive Bayes classifier
from skmultilearn.problem_transform import ClassifierChain # Implements problem transformation techniques for multi-label classification
import matplotlib.pyplot as plt # Creates visualizations and plots
import pickle # Enables object serialization and deserialization
from sklearn.pipeline import Pipeline # Chains multiple steps into a single unit for machine learning workflows
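# The NLTK calls below rely on locally installed resources: stopwords.words('english')
# needs the 'stopwords' corpus and word_tokenize needs the 'punkt' tokenizer models
# (newer NLTK versions may also require 'punkt_tab'). A one-time setup, if missing:
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')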
train_data = pd.read_csv('Train.csv')
test_data = pd.read_csv('Test.csv')
abstract_list_train = []
abstract_list_test = []
stemmer = PorterStemmer()
stop_words = stopwords.words('english')
# Remove stopwords and apply Porter stemming
def remove_stopwords(data=[]):
    data_list = []
    for name in data:
        words = word_tokenize(name)
        stem_word = ""
        for a in words:
            if a.lower() not in stop_words:
                stem_word += stemmer.stem(a) + ' '
        data_list.append(stem_word.lower())
    return data_list
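# Illustrative check (assuming the standard English stopword list):
# remove_stopwords(["The cats are running"]) returns ["cat run "], since
# "The"/"are" are dropped, "cats"/"running" are Porter-stemmed, and the result is lowercased.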
# Remove special characters
def remove_special_character(data=[]):
    abstract_list_wo_sc = []
    special_characters = '''!()-–[]{};:'"\, <>./?@#$%^&*_~0123456789+=“”'''
    for file in data:
        word_wo_sc = ""
        if len(file.split()) == 1:
            abstract_list_wo_sc.append(file)
        else:
            for a in file:
                if a in special_characters:
                    word_wo_sc += ' '
                else:
                    word_wo_sc += a
            abstract_list_wo_sc.append(word_wo_sc)
    return abstract_list_wo_sc
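# Illustrative check: every character in special_characters (punctuation, digits,
# even the space) is replaced by a space, so "graph-based (v2) methods!" becomes
# "graph based  v   methods "; the surplus whitespace is harmless because
# TfidfVectorizer's default tokenizer ignores it. Single-token inputs pass through unchanged.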
#Remove stopwords from Train Data
data_train = np.array(train_data['ABSTRACT'])
abstract_list_train = remove_stopwords(data_train)
#Remove stopwords from Test Data
data_test = np.array(test_data['ABSTRACT'])
abstract_list_test = remove_stopwords(data_test)
# Removing special characters from Train Data and Test Data
abstract_list_wo_sc_train = remove_special_character(abstract_list_train)
abstract_list_wo_sc_test = remove_special_character(abstract_list_test)
categories = ['Engineering', 'Business', 'Art']
x_train = abstract_list_wo_sc_train
y_train = train_data[categories]
x_test = abstract_list_wo_sc_test
y_test = test_data[categories]
print("There are ", len(x_train), " input training samples")
print("There are ", len(x_test), " input testing samples")
print("There are ", y_train.shape, " output training samples")
print("There are ", y_test.shape, " output testing samples")
# Build the TF-IDF + Classifier Chain pipeline
parameters = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', ClassifierChain(MultinomialNB())),
])
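# ClassifierChain trains one binary MultinomialNB per category in sequence and
# appends each earlier category's prediction to the next one's features, which
# lets the model exploit correlations between labels.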
# Fit the pipeline on the training data
parameters.fit(x_train, y_train)
# Predict labels for the test data (skmultilearn returns a sparse indicator matrix)
predictions = parameters.predict(x_test)
# Print accuracy score
accuracy = accuracy_score(y_test, predictions)
print('Accuracy:', accuracy)
# Print F1 score
f1 = f1_score(y_test, predictions, average='micro')
print('F1 score:', f1)
# Print classification report
report = classification_report(y_test, predictions)
print('Classification Report:')
print(report)
# Confusion matrix and heatmap generation (predictions are sparse, so densify first)
mat = confusion_matrix(y_test.values.argmax(axis=1), predictions.toarray().argmax(axis=1))
plt.figure(figsize=(8, 6))
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False, cmap='Blues')
plt.xlabel('True Label')
plt.ylabel('Predicted Label')
plt.title('Confusion Matrix')
plt.show()
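# Note: the argmax above keeps only each sample's highest-ranked category, so the
# heatmap is a single-label approximation. For true per-label error counts,
# sklearn.metrics.multilabel_confusion_matrix(y_test, predictions) gives one
# 2x2 matrix per category.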
# Save the trained classifier as a pickle file
# Note: this stores only the ClassifierChain step; the fitted TfidfVectorizer is
# not included, so raw text cannot be transformed from this file alone.
with open('model_MultiNB.pkl', 'wb') as picklefile:
    pickle.dump(parameters.named_steps['clf'], picklefile)
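# A minimal sketch of reusing the model for inference. Since only the classifier
# was pickled above, this assumes the whole fitted pipeline is also saved (the
# filename 'pipeline_MultiNB.pkl' is hypothetical):
#
# with open('pipeline_MultiNB.pkl', 'wb') as f:
#     pickle.dump(parameters, f)
# with open('pipeline_MultiNB.pkl', 'rb') as f:
#     loaded_pipeline = pickle.load(f)
# new_abstracts = remove_special_character(remove_stopwords(["An abstract about circuit design"]))
# print(loaded_pipeline.predict(new_abstracts).toarray())  # 0/1 flags per category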