[clean]: removed chunks
sentiment.py  CHANGED  (+3, -99)
@@ -1,51 +1,28 @@
 import numpy as np # For linear algebra
 import pandas as pd # Data processing, CSV file I/O (e.g. pd.read_csv)
 import matplotlib.pyplot as plt # For Visualisation
-# get_ipython().run_line_magic('matplotlib', 'inline')
 import seaborn as sns # For Visualisation
 from bs4 import BeautifulSoup # For Text Parsing


 # # IMPORTING DATASET

-# In[2]:
-
-
 data = pd.read_csv('Reviews.csv')
 # data


 # # DATA PREPROCESSING & VISUALISATION

-# In[3]:
-
-
 #data.isnull().sum()

-
-# In[4]:
-
-
 data=data.dropna()
 #data.isnull().sum()

-
-# In[5]:
-
-
 #data.shape

-
-# In[6]:
-
-
 score_unique = data['Score'].unique()
 #print(score_unique)

-
-# In[7]:
-
-
 # 0-> NEGATIVE REVIEW
 # 1-> NEUTRAL REVIEW
 # 2-> POSTIVE REVIEW
@@ -58,10 +35,6 @@ for i in data['Score']:
     if i>3:
         a.append(2)

-
-# In[8]:
-
-
 r_0, r_1, r_2 = 0, 0, 0
 for i in a:
     if i == 0:
@@ -75,10 +48,6 @@ for i in a:
 # print('Neutral Reviews:',r_1)
 # print('Positive Reviews:',r_2)

-
-# In[9]:
-
-
 # sns.countplot(a)
 # plt.xlabel('Reviews', color = 'red')
 # plt.ylabel('Count', color = 'red')
@@ -86,42 +55,22 @@ for i in a:
 # plt.title('COUNT PLOT', color = 'r')
 # plt.show()

-
-# In[10]:
-
-
 data['sentiment']=a
 #data
 final_dataset = data[['Text','sentiment']]
 #final_dataset

-
-# In[11]:
-
-
 data_p=final_dataset[data['sentiment']==2]
 data_n=final_dataset[data['sentiment']==0]
 #len(data_p), len(data_n)

-
-# In[12]:
-
-
 datap = data_p.iloc[np.random.randint(1,443766,5000), :]
 datan = data_n.iloc[np.random.randint(1, 82007,5000), :]
 #len(datan), len(datap)

-
-# In[13]:
-
-
 data = pd.concat([datap,datan])
 len(data)

-
-# In[14]:
-
-
 c=[]
 for i in data['sentiment']:
     if i==0:
@@ -130,17 +79,6 @@ for i in data['sentiment']:
         c.append(1)
 data['sentiment']=c

-
-# In[15]:
-
-
-# sns.countplot(data['sentiment'])
-# plt.show()
-
-
-# In[16]:
-
-
 def strip_html(text):
     soup = BeautifulSoup(text, "html.parser")
     return soup.get_text()
@@ -150,12 +88,6 @@ data=data.drop('Text',axis=1)

 #data.head()

-
-# # MODEL BUILDING
-
-# In[17]:
-
-
 import nltk #Natural Language Processing Toolkit
 def punc_clean(text):
     import string as st
@@ -164,10 +96,6 @@ def punc_clean(text):
 data['review'] = data['review'].apply(punc_clean)
 #data.head(2)

-
-# In[18]:
-
-
 def remove_stopword(text):
     stopword=nltk.corpus.stopwords.words('english')
     stopword.remove('not')
@@ -175,10 +103,6 @@ def remove_stopword(text):
     return ' '.join(a)
 #data['review'] = data['review'].apply(remove_stopword)

-
-# In[19]:
-
-
 from sklearn.feature_extraction.text import TfidfVectorizer

 vectr = TfidfVectorizer(ngram_range=(1,2),min_df=1)
@@ -187,9 +111,6 @@ vectr.fit(data['review'])
 vect_X = vectr.transform(data['review'])


-# In[20]:
-
-
 from sklearn.linear_model import LogisticRegression

 model = LogisticRegression()
@@ -200,28 +121,11 @@ clf=model.fit(vect_X,data['sentiment'])

 # # PREDICTION

-#
-
-
-clf.predict(vectr.transform(['''Nice look and build quality with moderately fast everything such as refresh rate, display quality, sound, processing, gaming experience and many more ..
-I didn't find any lagging or heating issue..And battery health I won't say great but I'll take that
-
-Only cons I can say about it is camera.. sharpening picture a little much at day light and low light photo you have to compromise.''']))
-
-
-# In[22]:
-
-
-clf.predict(vectr.transform(['''Phone has bugs , and screen quality is poor , Avoid realme. Gaming was just over hyped''']))
-
-
-# In[23]:
-
-
-clf.predict(vectr.transform(['''No lags found super speed and very good performance nice phone in this budget''']))
+# clf.predict(vectr.transform(['''Nice look and build quality with moderately fast everything such as refresh rate, display quality, sound, processing, gaming experience and many more .. I didn't find any lagging or heating issue..And battery health I won't say great but I'll take that, Only cons I can say about it is camera.. sharpening picture a little much at day light and low light photo you have to compromise.''']))

+# clf.predict(vectr.transform(['''Phone has bugs , and screen quality is poor , Avoid realme. Gaming was just over hyped''']))

-#
+# clf.predict(vectr.transform(['''No lags found super speed and very good performance nice phone in this budget''']))


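For reference, a minimal sketch of how the prediction calls commented out by this commit can still be exercised, assuming sentiment.py has already fitted vectr (the TfidfVectorizer) and clf (the LogisticRegression) as shown above; the sample strings are the same reviews used in the original script, and labels follow its mapping of 0 = negative, 1 = positive.

# Minimal usage sketch (assumes `vectr` and `clf` exist after running sentiment.py).
sample_reviews = [
    "No lags found super speed and very good performance nice phone in this budget",
    "Phone has bugs , and screen quality is poor , Avoid realme. Gaming was just over hyped",
]
# transform() maps raw strings into the fitted TF-IDF feature space;
# predict() returns one sentiment label (0 or 1) per input string.
labels = clf.predict(vectr.transform(sample_reviews))
for review, label in zip(sample_reviews, labels):
    print(label, review)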