Pushpa committed on
Commit
6ff2b24
1 Parent(s): 5eb4136

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +816 -0
app.py ADDED
@@ -0,0 +1,816 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# -*- coding: utf-8 -*-
"""Survey_Analysis_v_3_2_86.ipynb

Automatically generated by Colaboratory.

Original file is located at
https://colab.research.google.com/drive/1VOlSQ6kva-BiGfJc7b3BwlKBegP13tdS
"""

#1 - https://www.kaggle.com/code/ramjasmaurya/financial-sentiment-analysis
#2 - https://www.kaggle.com/code/adarshbiradar/sentiment-analysis-using-bert

import streamlit

# Commented out IPython magic to ensure Python compatibility.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib  # FIX: was missing -- the matplotlib.rc(...) calls below raised NameError
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

import pygal as py
import squarify as sq

# Global chart defaults for every figure in this notebook.
plt.rcParams["figure.figsize"] = (20, 15)
matplotlib.rc('xtick', labelsize=7)
matplotlib.rc('ytick', labelsize=7)

# FIX: 'normal' is not a valid font *family* (families are e.g. 'sans-serif');
# matplotlib emitted findfont warnings and fell back anyway. Weight/size kept.
font = {'family': 'sans-serif',
        'weight': 'bold',
        'size': 5}

matplotlib.rc('font', **font)
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
# %matplotlib inline
42
+
43
# Load the raw dataset. The CSV ships without a header row, so pandas promotes
# the first (sentiment, news) record to column names; it is restored below.
df = pd.read_csv("/content/gen-data.csv", engine="python", encoding="ISO-8859-1")
df

# The two "column names" are actually the first data record.
col1 = df.keys()[0]
col2 = df.keys()[1]
col2

df2 = pd.DataFrame([[col1, col2]], columns=list([col1, col2]), index=[4845])

# FIX: DataFrame.append() was removed in pandas 2.0 -- use pd.concat().
# set_axis(..., inplace=False) is also gone; axis=1 alone returns a copy.
df = pd.concat([df, df2], ignore_index=True).set_axis(['sentiment', 'news'], axis=1)

df

# NOTE(review): replacing "neutral" with "neutral" is a no-op; presumably a
# spelling normalization was intended -- confirm against the raw labels.
df = df.replace("neutral", "neutral")

sns.countplot(y="sentiment", data=df)

df.isnull().sum()
61
+
62
+ from textblob import TextBlob
63
+
64
def preprocess(ReviewText):
    """Strip HTML remnants from a Series of news strings.

    Parameters
    ----------
    ReviewText : pd.Series of str

    Returns
    -------
    pd.Series with <br/> tags, anchor tags and HTML entities removed, and
    non-breaking spaces collapsed to regular spaces.
    """
    # FIX: these patterns are regular expressions. pandas >= 2.0 defaults
    # Series.str.replace to regex=False, so without the explicit flag the
    # literal strings "(<br/>)" etc. never match and the text is untouched.
    ReviewText = ReviewText.str.replace("(<br/>)", "", regex=True)
    ReviewText = ReviewText.str.replace('(<a).*(>).*(</a>)', '', regex=True)
    ReviewText = ReviewText.str.replace('(&amp)', '', regex=True)
    ReviewText = ReviewText.str.replace('(&gt)', '', regex=True)
    ReviewText = ReviewText.str.replace('(&lt)', '', regex=True)
    ReviewText = ReviewText.str.replace('(\xa0)', ' ', regex=True)
    return ReviewText
72
# Clean the raw text, then derive the simple features used by the EDA below.
df['Review Text'] = preprocess(df['news'])

# FIX: coerce to str before handing text to TextBlob -- df['news'] may hold
# non-string values (news_len below already defends with astype(str)).
df['polarity'] = df['news'].map(lambda text: TextBlob(str(text)).sentiment.polarity)
df['news_len'] = df['news'].astype(str).apply(len)
df['word_count'] = df['news'].apply(lambda x: len(str(x).split()))

df
79
+
80
print('top 4 random reviews with the highest positive sentiment polarity: \n')

df1 = df.drop_duplicates(subset=['Review Text'])

# FIX: DataFrame.sample(k) raises ValueError when fewer than k rows match,
# so cap each sample size by the number of rows actually available.
pos_rows = df1.loc[df1.polarity == 1, ['Review Text']]
cl = pos_rows.sample(min(4, len(pos_rows))).values
for c in cl:
    print(c[0])

print('5 random reviews with the most neutral sentiment(zero) polarity: \n')
neu_rows = df.loc[df.polarity == 0, ['Review Text']]
cl1 = neu_rows.sample(min(5, len(neu_rows))).values
for c in cl1:
    print(c[0])

print('5 reviews with the most negative polarity having polarity lesser than -0.80: \n')
neg_rows = df.loc[df.polarity <= -0.80, ['Review Text']]
cl3 = neg_rows.sample(min(5, len(neg_rows))).values
for c in cl3:
    print(c[0])
97
+
98
# --- Exploratory plots over the derived polarity / length features --------

# Polarity spread.
sns.boxplot(df["polarity"], palette="rainbow", data=df)

df['polarity'].plot(kind='hist', bins=50, color="peru",
                    title='Sentiment Polarity Distribution')
plt.show()

# Row counts by the *sign* of the TextBlob polarity score.
p_s = df[df["polarity"] > 0].count()["sentiment"]
neu_s = df[df["polarity"] == 0].count()["sentiment"]
neg_s = df[df["polarity"] < 0].count()["sentiment"]

# Donut chart configuration: labels / sizes / colours / slice offsets.
sentiment = ['positive_sentiment', "neutral_sentiment", "negative_sentiment"]
values = [p_s, neu_s, neg_s]
colors = ['#FF0000', 'olive', '#FFFF00']
explode = (0.05, 0.05, 0.05)

plt.pie(values, colors=colors, labels=sentiment,
        autopct='%1.1f%%', pctdistance=0.85, explode=explode)

# Punch a white circle through the middle to turn the pie into a donut.
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

plt.title('count of polarity as per sentiment')
plt.show()

# Word-count distribution.
df.plot.box(y=["word_count"], color="hotpink")

df['word_count'].plot(kind='hist', bins=100, color="orange",
                      title='Review Text Word Count Distribution')
plt.show()

# News-length distribution.
sns.boxenplot(x="news_len", data=df)
plt.show()

df['news_len'].plot(kind='hist', bins=50, color="lightblue",
                    title='Review Text Word Count Distribution')
plt.show()

# Interactive scatter of length vs word count, coloured by gold label.
fig = px.scatter(df, x="news_len", y="word_count", color="sentiment",
                 marginal_x="box", marginal_y="violin",
                 title="Click on the legend items!")
fig.show()
161
+
162
def _ranked_terms(corpus, vectorizer, n=None):
    # Shared scoring core: fit the vectorizer on the corpus, sum term counts
    # over all documents, and return the n most frequent (term, count) pairs.
    vec = vectorizer.fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

def get_top_n_words(corpus, n=None):
    """Top unigrams, stop words kept."""
    return _ranked_terms(corpus, CountVectorizer(), n)

common_words = get_top_n_words(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)
df1 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
df1.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
    kind='bar', title='Top 20 words in review before removing stop words')
df1

def get_top_n_words(corpus, n=None):
    """Top unigrams, English stop words dropped."""
    return _ranked_terms(corpus, CountVectorizer(stop_words='english'), n)

common_words = get_top_n_words(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)
df2 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
df2.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
    kind='bar', title='Top 20 words in review after removing stop words')

def get_top_n_bigram(corpus, n=None):
    """Top bigrams, stop words kept."""
    return _ranked_terms(corpus, CountVectorizer(ngram_range=(2, 2)), n)

common_words = get_top_n_bigram(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)
df3 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
df3.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
    kind='bar', title='Top 20 bigrams in review before removing stop words')

def get_top_n_bigram(corpus, n=None):
    """Top bigrams, English stop words dropped."""
    return _ranked_terms(corpus, CountVectorizer(ngram_range=(2, 2), stop_words='english'), n)

common_words = get_top_n_bigram(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)
df4 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
df4.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
    kind='bar', title='Top 20 bigrams in review after removing stop words')

def get_top_n_trigram(corpus, n=None):
    """Top trigrams, stop words kept."""
    return _ranked_terms(corpus, CountVectorizer(ngram_range=(3, 3)), n)

common_words = get_top_n_trigram(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)
df5 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
df5.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
    kind='bar', title='Top 20 trigrams in review before removing stop words')

def get_top_n_trigram(corpus, n=None):
    """Top trigrams, English stop words dropped."""
    return _ranked_terms(corpus, CountVectorizer(ngram_range=(3, 3), stop_words='english'), n)

common_words = get_top_n_trigram(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)
df6 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
df6.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
    kind='bar', title='Top 20 trigrams in review after removing stop words')
245
+
246
import nltk

# Corpora/models required by TextBlob's POS tagger.
for pkg in ('punkt', 'wordnet', 'omw-1.4', 'averaged_perceptron_tagger'):
    nltk.download(pkg)

#import nltk
# Tag the (stringified) review column and chart the 20 most common POS tags.
blob = TextBlob(str(df['Review Text']))
pos_df = pd.DataFrame(blob.tags, columns=['word', 'pos'])
pos_df = pos_df.pos.value_counts()[:20]
pos_df.plot(kind='bar',
            title='Top 20 Part-of-speech tagging for review corpus')
259
+
260
# Per-class colours shared by the two box-plot figures below.
_BOX_STYLES = [('positive', 'rgb(214, 12, 140)'),
               ('negative', 'rgb(0, 128, 128)'),
               ('neutral', 'rgb(10, 140, 208)')]

# Box plot of TextBlob polarity per gold sentiment class.
data = [go.Box(y=df.loc[df['sentiment'] == label]['polarity'],
               name=label, marker=dict(color=color))
        for label, color in _BOX_STYLES]
layout = go.Layout(title="Polarity Boxplot according to sentiment")
go.Figure(data=data, layout=layout)

# Same figure shape, but for the raw news length.
data = [go.Box(y=df.loc[df['sentiment'] == label]['news_len'],
               name=label, marker=dict(color=color))
        for label, color in _BOX_STYLES]
layout = go.Layout(title="news length Boxplot by sentiment")
go.Figure(data=data, layout=layout)

# Overlaid polarity histograms, one semi-transparent trace per class.
data = [go.Histogram(x=df.loc[df['sentiment'] == label, 'polarity'],
                     name=label, opacity=0.75)
        for label in ('positive', 'neutral', 'negative')]
layout = go.Layout(barmode='overlay', title='Distribution of Sentiment polarity')
go.Figure(data=data, layout=layout)
343
+
344
def _density_figure(ycol, marg_name, ncontours, hovermode, marg_color):
    """Polarity-vs-*ycol* scatter with 2D density contours and marginal
    histograms on secondary axes (shared layout of the two notebook cells)."""
    traces = [
        go.Scatter(x=df['polarity'], y=df[ycol], mode='markers', name='points',
                   marker=dict(color='rgb(102,0,0)', size=2, opacity=0.4)),
        go.Histogram2dContour(x=df['polarity'], y=df[ycol], name='density',
                              ncontours=ncontours, colorscale='Hot',
                              reversescale=True, showscale=False),
        go.Histogram(x=df['polarity'], name='Sentiment polarity density',
                     marker=dict(color='rgb(102,0,0)'), yaxis='y2'),
        go.Histogram(y=df[ycol], name=marg_name,
                     marker=dict(color=marg_color), xaxis='x2'),
    ]
    layout = go.Layout(
        showlegend=False,
        autosize=False,
        width=600,
        height=550,
        # Main axes occupy 85% of the canvas; marginals take the outer strip.
        xaxis=dict(domain=[0, 0.85], showgrid=False, zeroline=False),
        yaxis=dict(domain=[0, 0.85], showgrid=False, zeroline=False),
        margin=dict(t=50),
        hovermode=hovermode,
        bargap=0,
        xaxis2=dict(domain=[0.85, 1], showgrid=False, zeroline=False),
        yaxis2=dict(domain=[0.85, 1], showgrid=False, zeroline=False),
    )
    return go.Figure(data=traces, layout=layout)

# Polarity vs news length (50 contours, unified hover).
_density_figure('news_len', 'news length density', 50, 'x unified', 'rgb(102,0,0)')

# Polarity vs word count (20 contours, closest-point hover).
_density_figure('word_count', 'word count density', 20, 'closest', 'rgb(112,0,0)')
449
+
450
+
451
import scattertext as st
import spacy

# Blank English pipeline; only sentence segmentation is needed here.
nlp = spacy.blank("en")
nlp.add_pipe('sentencizer')
#nlp.add_pipe(nlp.create_pipe('sentencizer'))

# Build a scattertext corpus keyed on the gold sentiment label, and show the
# 20 terms most distinctive versus the general-English background corpus.
corpus = st.CorpusFromPandas(df, category_col='sentiment', text_col='Review Text', nlp=nlp).build()
print(list(corpus.get_scaled_f_scores_vs_background().index[:20]))

# Per-class characteristic terms by scaled F-score.
term_freq_df = corpus.get_term_freq_df()
for _label in ('positive', 'neutral', 'negative'):
    _col = f'{_label}_sentiment'
    term_freq_df[_col] = corpus.get_scaled_f_scores(_label)
    list(term_freq_df.sort_values(by=_col, ascending=False).index[:20])
468
+
469
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from collections import Counter

# TF-IDF document-term matrix over the cleaned reviews, then an LSA
# (truncated SVD) projection of every document onto 10 latent topics.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', use_idf=True, smooth_idf=True)
reindexed_data = df['Review Text'].values
document_term_matrix = tfidf_vectorizer.fit_transform(reindexed_data)

n_topics = 10
lsa_model = TruncatedSVD(n_components=n_topics)
lsa_topic_matrix = lsa_model.fit_transform(document_term_matrix)
479
+
480
def get_keys(topic_matrix):
    '''
    Return, for every document row of *topic_matrix*, the index of its
    highest-scoring topic — i.e. the predicted topic category — as a list.
    '''
    return topic_matrix.argmax(axis=1).tolist()
487
+
488
def keys_to_counts(keys):
    '''
    Split a frequency count over *keys* into parallel lists of topic ids and
    their occurrence counts (ordered by first appearance in *keys*).
    '''
    count_pairs = Counter(keys).items()
    categories = [topic for topic, _ in count_pairs]
    counts = [cnt for _, cnt in count_pairs]
    return (categories, counts)
497
+
498
# Predicted topic id per document, plus how often each topic occurs.
lsa_keys = get_keys(lsa_topic_matrix)
lsa_categories, lsa_counts = keys_to_counts(lsa_keys)
500
+
501
def get_top_n_words(n, keys, document_term_matrix, tfidf_vectorizer):
    '''
    Return one string per topic containing that topic's *n* highest-weight
    words (heaviest first), based on the summed TF-IDF of the documents
    assigned to the topic.

    NOTE: the number of topics is read from the module-level n_topics; a
    topic with no assigned documents leaves the running sum as the int 0,
    which (as in the original cell) fails at .toarray().
    '''
    # Per-topic: sum the TF-IDF rows of its documents, then keep the n
    # largest-weight term indices in descending order.
    top_word_indices = []
    for topic in range(n_topics):
        temp_vector_sum = 0
        for doc_idx, key in enumerate(keys):
            if key == topic:
                temp_vector_sum += document_term_matrix[doc_idx]
        temp_vector_sum = temp_vector_sum.toarray()
        top_n_word_indices = np.flip(np.argsort(temp_vector_sum)[0][-n:], 0)
        top_word_indices.append(top_n_word_indices)

    # Map term indices back to vocabulary words via a one-hot inverse
    # transform, then join each topic's words into a single string.
    top_words = []
    for topic in top_word_indices:
        topic_words = []
        for index in topic:
            temp_word_vector = np.zeros((1, document_term_matrix.shape[1]))
            temp_word_vector[:, index] = 1
            the_word = tfidf_vectorizer.inverse_transform(temp_word_vector)[0][0]
            topic_words.append(the_word.encode('ascii').decode('utf-8'))
        top_words.append(" ".join(topic_words))
    return top_words
525
+
526
# Three most representative words for each LSA topic.
top_lsa = get_top_n_words(3, lsa_keys, document_term_matrix, tfidf_vectorizer)

for i, words in enumerate(top_lsa):
    print("Topic {}: ".format(i + 1), words)

# Bar chart: number of reviews that landed in each topic, labelled with the
# topic's top-3 words.
top_3_words = get_top_n_words(3, lsa_keys, document_term_matrix, tfidf_vectorizer)
labels = ['Topic {}: \n'.format(i + 1) + top_3_words[i] for i in lsa_categories]

fig, ax = plt.subplots(figsize=(16, 8))
ax.bar(lsa_categories, lsa_counts, color="skyblue")
ax.set_xticks(lsa_categories,)
ax.set_xticklabels(labels, rotation=45, rotation_mode='default', color="olive")
ax.set_ylabel('Number of review text on topics')
ax.set_title('Count of LSA topics')
plt.show()
540
+
541
+ """#---2----"""
542
+
543
+ df['sentiment'].value_counts()
544
+
545
+ from sklearn.model_selection import train_test_split
546
+ train,eva = train_test_split(df,test_size = 0.2)
547
+
548
+
549
+ from simpletransformers.classification import ClassificationModel
550
+
551
+ # Create a Transformer Model BERT
552
+ model = ClassificationModel('bert', 'bert-base-cased', num_labels=3, args={'reprocess_input_data': True, 'overwrite_output_dir': True},use_cuda=False)
553
+
554
+ # 0,1,2 : positive,negative
555
+ def making_label(st):
556
+ if(st=='positive'):
557
+ return 0
558
+ elif(st=='neutral'):
559
+ return 2
560
+ else:
561
+ return 1
562
+
563
# FIX: train/eva come out of train_test_split as derived frames; assigning a
# new column can trigger pandas' SettingWithCopyWarning. Work on explicit
# copies so the label column is written unambiguously.
train = train.copy()
eva = eva.copy()
train['label'] = train['sentiment'].apply(making_label)
eva['label'] = eva['sentiment'].apply(making_label)
print(train.shape)

# simpletransformers expects exactly the columns 'text' and 'label'.
train_df = pd.DataFrame({
    'text': train['news'][:1500].replace(r'\n', ' ', regex=True),
    'label': train['label'][:1500]
})

eval_df = pd.DataFrame({
    'text': eva['news'][-400:].replace(r'\n', ' ', regex=True),
    'label': eva['label'][-400:]
})
576
+
577
# Fine-tune the BERT classifier, then evaluate on the held-out frame.
model.train_model(train_df)

result, model_outputs, wrong_predictions = model.eval_model(eval_df)

result

model_outputs

len(wrong_predictions)

# Hard predictions = argmax over the per-class logits.
lst = []
for arr in model_outputs:
    lst.append(np.argmax(arr))

true = eval_df['label'].tolist()
predicted = lst

import sklearn
mat = sklearn.metrics.confusion_matrix(true, predicted)
mat

df_cm = pd.DataFrame(mat, range(3), range(3))

sns.heatmap(df_cm, annot=True)
plt.show()

# FIX: target_names must follow the integer encoding produced by
# making_label (0=positive, 1=negative, 2=neutral); the original order
# ['positive','neutral','negative'] mislabelled the per-class rows.
print(sklearn.metrics.classification_report(true, predicted,
                                            target_names=['positive', 'negative', 'neutral']))

sklearn.metrics.accuracy_score(true, predicted)
606
+
607
#Give your statement
def get_result(statement):
    """Classify *statement* with the fine-tuned model; print and return the
    sentiment name ('positive' / 'negative' / 'neutral').

    Returns the label string (was None) — a backward-compatible addition.
    """
    result = model.predict([statement])
    # FIX: np.where(...) + int(pos[0]) raised "only size-1 arrays can be
    # converted" whenever two logits tied for the maximum; np.argmax always
    # yields a single index.
    pos = int(np.argmax(result[1][0]))
    sentiment_dict = {0: 'positive', 1: 'negative', 2: 'neutral'}
    print(sentiment_dict[pos])
    return sentiment_dict[pos]
615
+
616
# Spot-check the classifier on hand-written statements. The first three are
# the canonical neutral / positive / negative examples from the dataset; the
# remainder probe behaviour on out-of-domain text.
_example_statements = [
    ## neutral statement
    "According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .",
    ## positive statement
    "According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales .",
    ## negative statement
    'Sales in Finland decreased by 2.0 % , and international sales decreased by 9.3 % in terms of euros , and by 15.1 % in terms of local currencies .',
    "This company is growing like anything with 23% profit every year",
    "This company is not able to make any profit but make very less profit in last quarter",
    "The doctor treated well and the patient was very healthy",
    "the act of politicians is to serve and help needy and not to create ruck suck",
    "American burger is too good. Can't resisit to go and have one",
    "GDP per capita increased to double in India from 2013",
    "Indian economy is doing very good and will become super power one day.",
    "Indian economy is doing very good and will create millions of jobs in coming years",
    "Indian economy is not doing very good and need urgent reforms but we are pretty sure it will be very good in coming years",
    "Indian economy is doing very good.Indian economy is not doing very good ",
    "Indian economy is not doing very good. Indian economy will bounce back to become leading economy",
    "Indian economy is not doing very good. Urgent reforms is required to create new jobs and improve export",
    "The stock market of Indian economy is dangling too much",
]

for _stmt in _example_statements:
    get_result(_stmt)
650
+
651
+ """#VADER"""
652
+
653
+ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
654
+
655
+ obj = SentimentIntensityAnalyzer()
656
+
657
+ sentence = "Ram is really good "
658
+ sentiment_dict = obj.polarity_scores(sentence)
659
+ print(sentiment_dict)
660
+
661
+ #check this
662
+ sentence = "Ram is better "
663
+ sentiment_dict = obj.polarity_scores(sentence)
664
+ print(sentiment_dict)
665
+
666
+ sentence = "Rahul is really bad"
667
+ sentiment_dict = obj.polarity_scores(sentence)
668
+ print(sentiment_dict)
669
+
670
+ #punctuation
671
+ print(obj.polarity_scores('Ram is good boy'))
672
+ print(obj.polarity_scores('Ram is good boy!'))
673
+ print(obj.polarity_scores('Ram is good boy!!'))
674
+
675
+ #capitalization
676
+ print(obj.polarity_scores('Ram is good'))
677
+ print(obj.polarity_scores('Ram is GOOD'))
678
+
679
+ #degree
680
+ print(obj.polarity_scores('Ram is good'))
681
+ print(obj.polarity_scores('Ram is better'))
682
+ print(obj.polarity_scores('Ram is best'))
683
+
684
+ print(obj.polarity_scores('Ram is bad'))
685
+ print(obj.polarity_scores('Ram is worse'))
686
+ print(obj.polarity_scores('Ram is worst'))
687
+
688
+ #conjuction
689
+ print(obj.polarity_scores('Ram is good'))
690
+ print(obj.polarity_scores('Ram is good, but he is also naughty sometimes'))
691
+
692
+ #slang
693
+ print(obj.polarity_scores("That Hotel"))
694
+ print(obj.polarity_scores("That Hotel SUX"))
695
+ print(obj.polarity_scores("That Hotel SUCKS"))
696
+
697
+ #emoticons
698
+ print(obj.polarity_scores("Your :) is the most beautiful thing I have ever seen"))
699
+ print(obj.polarity_scores("Your smile is the most beautiful thing I have ever seen"))
700
+
701
+ print(obj.polarity_scores("Your :( is the worst thing I have ever seen"))
702
+ print(obj.polarity_scores("Your smile is the worst thing I have ever seen"))
703
+
704
+ #https://360digitmg.com/blog/bert-variants-and-their-differences
705
+ #https://simpletransformers.ai/docs/classification-specifics/#supported-model-types Official reference
706
+
707
+ """#3.a Using FINBERT Model"""
708
+
709
+ #PPT
710
+ #https://medium.com/@benjamin_joesy/finbert-financial-sentiment-analysis-with-bert-acf695b64ac6
711
+
712
+ from transformers import BertTokenizer, BertForSequenceClassification, pipeline
713
+
714
+ # tested in transformers==4.18.0
715
+ import transformers
716
+ transformers.__version__
717
+
718
+ finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
719
+ tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
720
+
721
+ nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)
722
+ results = nlp(['growth is strong and we have plenty of liquidity.',
723
+ 'there is a shortage of capital, and we need extra financing.',
724
+ 'formulation patents might protect Vasotec to a limited extent.'])
725
+
726
+ results
727
+
728
+ """#FINBERT ESG"""
729
+
730
+ finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-esg',num_labels=4)
731
+ tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-esg')
732
+
733
+ nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)
734
+ results = nlp(['Managing and working to mitigate the impact our operations have on the environment is a core element of our business.',
735
+ 'Rhonda has been volunteering for several years for a variety of charitable community programs.',
736
+ 'Cabot\'s annual statements are audited annually by an independent registered public accounting firm.',
737
+ 'As of December 31, 2012, the 2011 Term Loan had a principal balance of $492.5 million.'])
738
+
739
+ results
740
+
741
+ """#FINBERT Classification"""
742
+
743
+ finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-fls',num_labels=3)
744
+ tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-fls')
745
+
746
+ nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)
747
+ results = nlp(['we expect the age of our fleet to enhance availability and reliability due to reduced downtime for repairs.',
748
+ 'on an equivalent unit of production basis, general and administrative expenses declined 24 percent from 1994 to $.67 per boe.',
749
+ 'we will continue to assess the need for a valuation allowance against deferred tax assets considering all available evidence obtained in'])
750
+
751
+ results
752
+
753
# Score every cleaned review with FinBERT-tone and compare against the gold
# sentiment labels.
X = df['Review Text'].to_list()
y = df['sentiment'].to_list()

from transformers import BertTokenizer, BertForSequenceClassification

finbert_whole = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
tokenizer_whole = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

# FinBERT-tone head order: 0 = neutral, 1 = positive, 2 = negative.
labels = {0: 'neutral', 1: 'positive', 2: 'negative'}

sent_val = []
for text in X:
    inputs = tokenizer_whole(text, return_tensors="pt", padding=True)
    outputs = finbert_whole(**inputs)[0]

    # Highest logit wins.
    val = labels[np.argmax(outputs.detach().numpy())]
    print(text, '---->', val)
    print('#######################################################')
    sent_val.append(val)

from sklearn.metrics import accuracy_score
print(accuracy_score(y, sent_val))
775
+
776
+ """#Using DISTILBERT"""
777
+
778
+ from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
779
+
780
+ tokenizer_distilbert = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
781
+ model_distilbert = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
782
+
783
+ labels = {0:'neutral', 1:'positive',2:'negative'}
784
+
785
+ sent_val_bert = list()
786
+ for x in X:
787
+ inputs = tokenizer_distilbert(x, return_tensors="pt", padding=True)
788
+ outputs = model_distilbert(**inputs)[0]
789
+
790
+ val = labels[np.argmax(outputs.detach().numpy())]
791
+ print(x, '---->', val)
792
+ print('#######################################################')
793
+ sent_val_bert.append(val)
794
+
795
+ from sklearn.metrics import accuracy_score
796
+ print(accuracy_score(y, sent_val))
797
+
798
+ """#Bert"""
799
+
800
+ tokenizer_bert = DistilBertTokenizer.from_pretrained("bert-base-uncased")
801
+ model_bert = DistilBertForSequenceClassification.from_pretrained("bert-base-uncased")
802
+
803
+ labels = {0:'neutral', 1:'positive',2:'negative'}
804
+
805
+ sent_val_bert1 = list()
806
+ for x in X:
807
+ inputs = tokenizer_bert(x, return_tensors="pt", padding=True)
808
+ outputs = model_bert(**inputs)[0]
809
+
810
+ val = labels[np.argmax(outputs.detach().numpy())]
811
+ print(x, '---->', val)
812
+ print('#######################################################')
813
+ sent_val_bert1.append(val)
814
+
815
+ from sklearn.metrics import accuracy_score
816
+ print(accuracy_score(y, sent_val))