"""Train a binary sentiment classifier on review text.

Pipeline, in order:
1. Read ``Reviews.csv`` and drop rows with missing values.
2. Label every review from its ``Score`` (negative / neutral / positive).
3. Draw an equally sized random sample of positive and negative reviews.
4. Strip HTML and punctuation from the review text.
5. Fit a TF-IDF (unigram + bigram) vectorizer and a logistic-regression model.

The fitted vectorizer (``vectr``) and classifier (``clf``) stay at module level
so they can be used for ad-hoc predictions (see the examples at the bottom).
"""
import string  # Punctuation characters for text cleaning
from functools import lru_cache

import matplotlib.pyplot as plt  # For visualisation
import nltk  # Natural Language Toolkit
import numpy as np  # For linear algebra
import pandas as pd  # Data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns  # For visualisation
from bs4 import BeautifulSoup  # For text parsing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Number of reviews drawn (with replacement) from each sentiment class.
SAMPLES_PER_CLASS = 5000

# Three-class label (0 negative, 2 positive) -> binary label (0 negative, 1 positive).
_BINARY_LABEL = {0: 0, 2: 1}

# Translation table that deletes every ASCII punctuation character; built once
# so punc_clean does a single C-level pass per review.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def strip_html(text: str) -> str:
    """Return *text* with HTML markup removed, keeping only the visible text."""
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()


def punc_clean(text: str) -> str:
    """Return *text* with every character in ``string.punctuation`` removed."""
    return text.translate(_PUNCTUATION_TABLE)


@lru_cache(maxsize=1)
def _stopwords_without_not() -> frozenset:
    """Return NLTK's English stopwords minus 'not', built on first use.

    Loading is deferred to the first call so that merely running this script
    does not require the NLTK stopword corpus to be installed.
    """
    return frozenset(nltk.corpus.stopwords.words('english')) - {'not'}


def remove_stopword(text: str) -> str:
    """Return *text* re-joined with single spaces after dropping stopwords.

    The text is tokenized with ``nltk.word_tokenize``. The word 'not' is
    deliberately kept (presumably because negation matters for sentiment).
    """
    stopwords = _stopwords_without_not()
    return ' '.join(w for w in nltk.word_tokenize(text) if w not in stopwords)


# # IMPORTING DATASET
data = pd.read_csv('Reviews.csv')

# # DATA PREPROCESSING & VISUALISATION
data = data.dropna()
score_unique = data['Score'].unique()

# 0 -> NEGATIVE REVIEW (Score < 3)
# 1 -> NEUTRAL REVIEW  (Score == 3)
# 2 -> POSITIVE REVIEW (Score > 3)
a = [0 if score < 3 else 1 if score == 3 else 2 for score in data['Score']]

# Per-class review counts (used by the optional report/plot below).
r_0, r_1, r_2 = (a.count(label) for label in (0, 1, 2))

# print('Negative Reviews:', r_0)
# print('Neutral Reviews:', r_1)
# print('Positive Reviews:', r_2)
# sns.countplot(a)
# plt.xlabel('Reviews', color='red')
# plt.ylabel('Count', color='red')
# plt.xticks([0, 1, 2], ['Negative', 'Neutral', 'Positive'])
# plt.title('COUNT PLOT', color='r')
# plt.show()

data['sentiment'] = a
final_dataset = data[['Text', 'sentiment']]

# Neutral reviews are left out: the model is trained on positive vs negative only.
data_p = final_dataset[final_dataset['sentiment'] == 2]
data_n = final_dataset[final_dataset['sentiment'] == 0]

# Balanced sample, drawn with replacement. The index bounds come from the
# actual frame lengths so every row (including the first and last) is
# eligible and the draw cannot go out of range for a different-sized dataset.
datap = data_p.iloc[np.random.randint(0, len(data_p), SAMPLES_PER_CLASS), :]
datan = data_n.iloc[np.random.randint(0, len(data_n), SAMPLES_PER_CLASS), :]
data = pd.concat([datap, datan])

# Collapse the labels to binary: 0 = negative, 1 = positive.
c = [_BINARY_LABEL[label] for label in data['sentiment']]
data['sentiment'] = c

# Text cleaning: HTML first, then punctuation.
data['review'] = data['Text'].apply(strip_html)
data = data.drop('Text', axis=1)
data['review'] = data['review'].apply(punc_clean)
# Stopword removal is available but currently disabled:
# data['review'] = data['review'].apply(remove_stopword)

# # MODEL TRAINING
vectr = TfidfVectorizer(ngram_range=(1, 2), min_df=1)
vect_X = vectr.fit_transform(data['review'])

model = LogisticRegression()
clf = model.fit(vect_X, data['sentiment'])
# Training-set accuracy in percent:
# clf.score(vect_X, data['sentiment']) * 100

# # PREDICTION
# clf.predict(vectr.transform(['''Nice look and build quality with moderately fast everything such as refresh rate, display quality, sound, processing, gaming experience and many more .. I didn't find any lagging or heating issue..And battery health I won't say great but I'll take that, Only cons I can say about it is camera.. sharpening picture a little much at day light and low light photo you have to compromise.''']))
# clf.predict(vectr.transform(['''Phone has bugs , and screen quality is poor , Avoid realme. Gaming was just over hyped''']))
# clf.predict(vectr.transform(['''No lags found super speed and very good performance nice phone in this budget''']))