srini047 committed on
Commit
3328b56
1 Parent(s): 1f2e957

[clean]: removed chunks

Browse files
Files changed (1) hide show
  1. sentiment.py +3 -99
sentiment.py CHANGED
@@ -1,51 +1,28 @@
1
  import numpy as np # For linear algebra
2
  import pandas as pd # Data processing, CSV file I/O (e.g. pd.read_csv)
3
  import matplotlib.pyplot as plt # For Visualisation
4
- # get_ipython().run_line_magic('matplotlib', 'inline')
5
  import seaborn as sns # For Visualisation
6
  from bs4 import BeautifulSoup # For Text Parsing
7
 
8
 
9
  # # IMPORTING DATASET
10
 
11
- # In[2]:
12
-
13
-
14
  data = pd.read_csv('Reviews.csv')
15
  # data
16
 
17
 
18
  # # DATA PREPROCESSING & VISUALISATION
19
 
20
- # In[3]:
21
-
22
-
23
  #data.isnull().sum()
24
 
25
-
26
- # In[4]:
27
-
28
-
29
  data=data.dropna()
30
  #data.isnull().sum()
31
 
32
-
33
- # In[5]:
34
-
35
-
36
  #data.shape
37
 
38
-
39
- # In[6]:
40
-
41
-
42
  score_unique = data['Score'].unique()
43
  #print(score_unique)
44
 
45
-
46
- # In[7]:
47
-
48
-
49
  # 0-> NEGATIVE REVIEW
50
  # 1-> NEUTRAL REVIEW
51
  # 2-> POSITIVE REVIEW
@@ -58,10 +35,6 @@ for i in data['Score']:
58
  if i>3:
59
  a.append(2)
60
 
61
-
62
- # In[8]:
63
-
64
-
65
  r_0, r_1, r_2 = 0, 0, 0
66
  for i in a:
67
  if i == 0:
@@ -75,10 +48,6 @@ for i in a:
75
  # print('Neutral Reviews:',r_1)
76
  # print('Positive Reviews:',r_2)
77
 
78
-
79
- # In[9]:
80
-
81
-
82
  # sns.countplot(a)
83
  # plt.xlabel('Reviews', color = 'red')
84
  # plt.ylabel('Count', color = 'red')
@@ -86,42 +55,22 @@ for i in a:
86
  # plt.title('COUNT PLOT', color = 'r')
87
  # plt.show()
88
 
89
-
90
- # In[10]:
91
-
92
-
93
  data['sentiment']=a
94
  #data
95
  final_dataset = data[['Text','sentiment']]
96
  #final_dataset
97
 
98
-
99
- # In[11]:
100
-
101
-
102
  data_p=final_dataset[data['sentiment']==2]
103
  data_n=final_dataset[data['sentiment']==0]
104
  #len(data_p), len(data_n)
105
 
106
-
107
- # In[12]:
108
-
109
-
110
  datap = data_p.iloc[np.random.randint(1,443766,5000), :]
111
  datan = data_n.iloc[np.random.randint(1, 82007,5000), :]
112
  #len(datan), len(datap)
113
 
114
-
115
- # In[13]:
116
-
117
-
118
  data = pd.concat([datap,datan])
119
  len(data)
120
 
121
-
122
- # In[14]:
123
-
124
-
125
  c=[]
126
  for i in data['sentiment']:
127
  if i==0:
@@ -130,17 +79,6 @@ for i in data['sentiment']:
130
  c.append(1)
131
  data['sentiment']=c
132
 
133
-
134
- # In[15]:
135
-
136
-
137
- # sns.countplot(data['sentiment'])
138
- # plt.show()
139
-
140
-
141
- # In[16]:
142
-
143
-
144
  def strip_html(text):
145
  soup = BeautifulSoup(text, "html.parser")
146
  return soup.get_text()
@@ -150,12 +88,6 @@ data=data.drop('Text',axis=1)
150
 
151
  #data.head()
152
 
153
-
154
- # # MODEL BUILDING
155
-
156
- # In[17]:
157
-
158
-
159
  import nltk #Natural Language Processing Toolkit
160
  def punc_clean(text):
161
  import string as st
@@ -164,10 +96,6 @@ def punc_clean(text):
164
  data['review'] = data['review'].apply(punc_clean)
165
  #data.head(2)
166
 
167
-
168
- # In[18]:
169
-
170
-
171
  def remove_stopword(text):
172
  stopword=nltk.corpus.stopwords.words('english')
173
  stopword.remove('not')
@@ -175,10 +103,6 @@ def remove_stopword(text):
175
  return ' '.join(a)
176
  #data['review'] = data['review'].apply(remove_stopword)
177
 
178
-
179
- # In[19]:
180
-
181
-
182
  from sklearn.feature_extraction.text import TfidfVectorizer
183
 
184
  vectr = TfidfVectorizer(ngram_range=(1,2),min_df=1)
@@ -187,9 +111,6 @@ vectr.fit(data['review'])
187
  vect_X = vectr.transform(data['review'])
188
 
189
 
190
- # In[20]:
191
-
192
-
193
  from sklearn.linear_model import LogisticRegression
194
 
195
  model = LogisticRegression()
@@ -200,28 +121,11 @@ clf=model.fit(vect_X,data['sentiment'])
200
 
201
  # # PREDICTION
202
 
203
- # In[21]:
204
-
205
-
206
- clf.predict(vectr.transform(['''Nice look and build quality with moderately fast everything such as refresh rate, display quality, sound, processing, gaming experience and many more ..
207
- I didn't find any lagging or heating issue..And battery health I won't say great but I'll take that
208
-
209
- Only cons I can say about it is camera.. sharpening picture a little much at day light and low light photo you have to compromise.''']))
210
-
211
-
212
- # In[22]:
213
-
214
-
215
- clf.predict(vectr.transform(['''Phone has bugs , and screen quality is poor , Avoid realme. Gaming was just over hyped''']))
216
-
217
-
218
- # In[23]:
219
-
220
-
221
- clf.predict(vectr.transform(['''No lags found super speed and very good performance nice phone in this budget''']))
222
 
 
223
 
224
- # In[ ]:
225
 
226
 
227
 
 
1
  import numpy as np # For linear algebra
2
  import pandas as pd # Data processing, CSV file I/O (e.g. pd.read_csv)
3
  import matplotlib.pyplot as plt # For Visualisation
 
4
  import seaborn as sns # For Visualisation
5
  from bs4 import BeautifulSoup # For Text Parsing
6
 
7
 
8
  # # IMPORTING DATASET
9
 
 
 
 
10
  data = pd.read_csv('Reviews.csv')
11
  # data
12
 
13
 
14
  # # DATA PREPROCESSING & VISUALISATION
15
 
 
 
 
16
  #data.isnull().sum()
17
 
 
 
 
 
18
  data=data.dropna()
19
  #data.isnull().sum()
20
 
 
 
 
 
21
  #data.shape
22
 
 
 
 
 
23
  score_unique = data['Score'].unique()
24
  #print(score_unique)
25
 
 
 
 
 
26
  # 0-> NEGATIVE REVIEW
27
  # 1-> NEUTRAL REVIEW
28
  # 2-> POSITIVE REVIEW
 
35
  if i>3:
36
  a.append(2)
37
 
 
 
 
 
38
  r_0, r_1, r_2 = 0, 0, 0
39
  for i in a:
40
  if i == 0:
 
48
  # print('Neutral Reviews:',r_1)
49
  # print('Positive Reviews:',r_2)
50
 
 
 
 
 
51
  # sns.countplot(a)
52
  # plt.xlabel('Reviews', color = 'red')
53
  # plt.ylabel('Count', color = 'red')
 
55
  # plt.title('COUNT PLOT', color = 'r')
56
  # plt.show()
57
 
 
 
 
 
58
  data['sentiment']=a
59
  #data
60
  final_dataset = data[['Text','sentiment']]
61
  #final_dataset
62
 
 
 
 
 
63
  data_p=final_dataset[data['sentiment']==2]
64
  data_n=final_dataset[data['sentiment']==0]
65
  #len(data_p), len(data_n)
66
 
 
 
 
 
67
  datap = data_p.iloc[np.random.randint(1,443766,5000), :]
68
  datan = data_n.iloc[np.random.randint(1, 82007,5000), :]
69
  #len(datan), len(datap)
70
 
 
 
 
 
71
  data = pd.concat([datap,datan])
72
  len(data)
73
 
 
 
 
 
74
  c=[]
75
  for i in data['sentiment']:
76
  if i==0:
 
79
  c.append(1)
80
  data['sentiment']=c
81
 
 
 
 
 
 
 
 
 
 
 
 
82
  def strip_html(text):
83
  soup = BeautifulSoup(text, "html.parser")
84
  return soup.get_text()
 
88
 
89
  #data.head()
90
 
 
 
 
 
 
 
91
  import nltk #Natural Language Processing Toolkit
92
  def punc_clean(text):
93
  import string as st
 
96
  data['review'] = data['review'].apply(punc_clean)
97
  #data.head(2)
98
 
 
 
 
 
99
  def remove_stopword(text):
100
  stopword=nltk.corpus.stopwords.words('english')
101
  stopword.remove('not')
 
103
  return ' '.join(a)
104
  #data['review'] = data['review'].apply(remove_stopword)
105
 
 
 
 
 
106
  from sklearn.feature_extraction.text import TfidfVectorizer
107
 
108
  vectr = TfidfVectorizer(ngram_range=(1,2),min_df=1)
 
111
  vect_X = vectr.transform(data['review'])
112
 
113
 
 
 
 
114
  from sklearn.linear_model import LogisticRegression
115
 
116
  model = LogisticRegression()
 
121
 
122
  # # PREDICTION
123
 
124
+ # clf.predict(vectr.transform(['''Nice look and build quality with moderately fast everything such as refresh rate, display quality, sound, processing, gaming experience and many more .. I didn't find any lagging or heating issue..And battery health I won't say great but I'll take that, Only cons I can say about it is camera.. sharpening picture a little much at day light and low light photo you have to compromise.''']))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
+ # clf.predict(vectr.transform(['''Phone has bugs , and screen quality is poor , Avoid realme. Gaming was just over hyped''']))
127
 
128
+ # clf.predict(vectr.transform(['''No lags found super speed and very good performance nice phone in this budget''']))
129
 
130
 
131