srini047 committed on
Commit
df00128
·
1 Parent(s): 074cf1f

added function file

Browse files
Files changed (1) hide show
  1. sentiment.py +238 -0
sentiment.py ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+ # # Text Based Sentiment Analysis
5
+
6
+ # # IMPORTING NECESSARY MODULES
7
+
8
+ # In[1]:
9
+
10
+
11
+ import numpy as np # For linear algebra
12
+ import pandas as pd # Data processing, CSV file I/O (e.g. pd.read_csv)
13
+ import matplotlib.pyplot as plt # For Visualisation
14
+ # get_ipython().run_line_magic('matplotlib', 'inline')
15
+ import seaborn as sns # For Visualisation
16
+ from bs4 import BeautifulSoup # For Text Parsing
17
+
18
+
19
+ # # IMPORTING DATASET
20
+
21
+ # In[2]:
22
+
23
+
24
+ data = pd.read_csv('Reviews.csv')
25
+ # data
26
+
27
+
28
+ # # DATA PREPROCESSING & VISUALISATION
29
+
30
+ # In[3]:
31
+
32
+
33
+ #data.isnull().sum()
34
+
35
+
36
+ # In[4]:
37
+
38
+
39
+ data=data.dropna()
40
+ #data.isnull().sum()
41
+
42
+
43
+ # In[5]:
44
+
45
+
46
+ #data.shape
47
+
48
+
49
+ # In[6]:
50
+
51
+
52
+ score_unique = data['Score'].unique()
53
+ #print(score_unique)
54
+
55
+
56
+ # In[7]:
57
+
58
+
59
+ # 0-> NEGATIVE REVIEW
60
+ # 1-> NEUTRAL REVIEW
61
+ # 2-> POSITIVE REVIEW
62
# Walk the scores once and append the class code for each rating:
# above 3 -> 2, exactly 3 -> 1, below 3 -> 0.
a = []
for score in data['Score']:
    if score > 3:
        a.append(2)
    elif score == 3:
        a.append(1)
    elif score < 3:
        a.append(0)
70
+
71
+
72
+ # In[8]:
73
+
74
+
75
# Tally the class codes collected in `a`; every entry that is neither 0 nor 1
# is counted towards r_2.
r_0 = a.count(0)
r_1 = a.count(1)
r_2 = len(a) - r_0 - r_1
83
+
84
+ # print('Negative Reviews:',r_0)
85
+ # print('Neutral Reviews:',r_1)
86
+ # print('Positive Reviews:',r_2)
87
+
88
+
89
+ # In[9]:
90
+
91
+
92
+ # sns.countplot(a)
93
+ # plt.xlabel('Reviews', color = 'red')
94
+ # plt.ylabel('Count', color = 'red')
95
+ # plt.xticks([0,1,2],['Negative','Neutral','Positive'])
96
+ # plt.title('COUNT PLOT', color = 'r')
97
+ # plt.show()
98
+
99
+
100
+ # In[10]:
101
+
102
+
103
+ data['sentiment']=a
104
+ #data
105
+ final_dataset = data[['Text','sentiment']]
106
+ #final_dataset
107
+
108
+
109
+ # In[11]:
110
+
111
+
112
+ data_p=final_dataset[data['sentiment']==2]
113
+ data_n=final_dataset[data['sentiment']==0]
114
+ #len(data_p), len(data_n)
115
+
116
+
117
+ # In[12]:
118
+
119
+
120
# Draw 5000 random row positions from each class so both classes contribute
# the same number of reviews. np.random.randint picks positions independently,
# so the same row may be selected more than once.
# The upper bound is taken from each frame's own length (exclusive) and the
# lower bound is 0, so every row is eligible and the draw cannot index past
# the end of the frame when the CSV has a different number of rows.
datap = data_p.iloc[np.random.randint(0, len(data_p), 5000), :]
datan = data_n.iloc[np.random.randint(0, len(data_n), 5000), :]
122
+ #len(datan), len(datap)
123
+
124
+
125
+ # In[13]:
126
+
127
+
128
+ data = pd.concat([datap,datan])
129
+ len(data)
130
+
131
+
132
+ # In[14]:
133
+
134
+
135
# Collapse the labels to a binary target: 0 stays 0 and 2 becomes 1.
# Any other label value contributes no entry.
c = [1 if label == 2 else 0
     for label in data['sentiment']
     if label == 0 or label == 2]
data['sentiment'] = c
142
+
143
+
144
+ # In[15]:
145
+
146
+
147
+ # sns.countplot(data['sentiment'])
148
+ # plt.show()
149
+
150
+
151
+ # In[16]:
152
+
153
+
154
def strip_html(text):
    """Return the text content of *text* after parsing it as HTML.

    The input is parsed with BeautifulSoup's "html.parser" backend and the
    result of ``get_text()`` on the parsed document is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()
157
+ data['review'] = data['Text'].apply(strip_html)
158
+
159
+ data=data.drop('Text',axis=1)
160
+
161
+ #data.head()
162
+
163
+
164
+ # # MODEL BUILDING
165
+
166
+ # In[17]:
167
+
168
+
169
+ import nltk #Natural Language Processing Toolkit
170
def punc_clean(text):
    """Return *text* with every character in ``string.punctuation`` removed.

    All other characters (letters, digits, whitespace, non-ASCII symbols) are
    kept in their original order.

    Args:
        text: The string to clean.

    Returns:
        A new string without ASCII punctuation characters.
    """
    import string  # local import keeps the function self-contained

    # A deletion-only translation table removes the characters in one pass
    # instead of testing each character in a Python-level loop.
    return text.translate(str.maketrans('', '', string.punctuation))
174
+ data['review'] = data['review'].apply(punc_clean)
175
+ #data.head(2)
176
+
177
+
178
+ # In[18]:
179
+
180
+
181
def remove_stopword(text):
    """Return *text* with English stopwords removed, keeping the word 'not'.

    The text is split with ``nltk.word_tokenize``; tokens found in NLTK's
    English stopword list (minus 'not') are dropped and the remaining tokens
    are joined with single spaces. Token comparison is exact (case-sensitive).

    Args:
        text: The string to filter.

    Returns:
        The space-joined tokens that are not stopwords.
    """
    # Load the stopword list only once and keep it as a set on the function
    # object: re-reading the corpus and scanning a list for every token is
    # wasteful when this function is applied to thousands of reviews.
    stopword = getattr(remove_stopword, "_stopword_cache", None)
    if stopword is None:
        stopword = set(nltk.corpus.stopwords.words('english'))
        # 'not' is kept in the text; discard() does not raise if it is absent.
        stopword.discard('not')
        remove_stopword._stopword_cache = stopword
    return ' '.join(w for w in nltk.word_tokenize(text) if w not in stopword)
186
+ #data['review'] = data['review'].apply(remove_stopword)
187
+
188
+
189
+ # In[19]:
190
+
191
+
192
+ from sklearn.feature_extraction.text import TfidfVectorizer
193
+
194
# Build unigram + bigram TF-IDF features for the cleaned reviews.
# fit_transform learns the vocabulary and produces the feature matrix in one
# pass over the corpus, instead of a separate fit followed by transform on the
# same data. `vectr` stays fitted for the prediction cells further down.
vectr = TfidfVectorizer(ngram_range=(1, 2), min_df=1)
vect_X = vectr.fit_transform(data['review'])
198
+
199
+
200
+ # In[20]:
201
+
202
+
203
+ from sklearn.linear_model import LogisticRegression
204
+
205
+ model = LogisticRegression()
206
+
207
+ clf=model.fit(vect_X,data['sentiment'])
208
+ #clf.score(vect_X,data['sentiment'])*100
209
+
210
+
211
+ # # PREDICTION
212
+
213
+ # In[21]:
214
+
215
+
216
+ clf.predict(vectr.transform(['''Nice look and build quality with moderately fast everything such as refresh rate, display quality, sound, processing, gaming experience and many more ..
217
+ I didn't find any lagging or heating issue..And battery health I won't say great but I'll take that
218
+
219
+ Only cons I can say about it is camera.. sharpening picture a little much at day light and low light photo you have to compromise.''']))
220
+
221
+
222
+ # In[22]:
223
+
224
+
225
+ clf.predict(vectr.transform(['''Phone has bugs , and screen quality is poor , Avoid realme. Gaming was just over hyped''']))
226
+
227
+
228
+ # In[23]:
229
+
230
+
231
+ clf.predict(vectr.transform(['''No lags found super speed and very good performance nice phone in this budget''']))
232
+
233
+
234
+ # In[ ]:
235
+
236
+
237
+
238
+