|
|
|
"""Survey_Analysis_v_3_2_86.ipynb |
|
|
|
Automatically generated by Colaboratory. |
|
|
|
Original file is located at |
|
https://colab.research.google.com/drive/1VOlSQ6kva-BiGfJc7b3BwlKBegP13tdS |
|
""" |
|
|
|
|
|
|
|
|
|
import streamlit |
|
|
|
|
|
|
|
|
|
|
|
|
|
import numpy as np |
|
import pandas as pd |
|
import seaborn as sns |
|
import matplotlib.pyplot as plt |
|
import plotly.express as px |
|
import plotly.graph_objects as go |
|
|
|
|
|
import pygal as py |
|
import squarify as sq |
|
import matplotlib |
|
plt.rcParams["figure.figsize"] = (20,15) |
|
matplotlib.rc('xtick', labelsize=7) |
|
matplotlib.rc('ytick', labelsize=7) |
|
|
|
font = {'family': 'sans-serif',  # 'normal' is not a valid matplotlib font family and triggers findfont warnings
        'weight': 'bold',
        'size': 5}
|
|
|
matplotlib.rc('font', **font) |
|
from sklearn.feature_extraction.text import CountVectorizer |
|
import warnings |
|
warnings.filterwarnings("ignore", category=FutureWarning) |
|
|
|
|
|
df=pd.read_csv("/content/gen-data.csv",engine="python",encoding="ISO-8859-1") |
|
df |
|
|
|
col1=df.keys()[0] |
|
col2=df.keys()[1] |
|
col2 |
|
|
|
# The CSV has no header row, so its first record was read in as the column names;
# put that record back as a data row and give the columns proper names.
df2 = pd.DataFrame([[col1, col2]], columns=[col1, col2], index=[4845])

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
df = pd.concat([df, df2], ignore_index=True).set_axis(['sentiment', 'news'], axis=1)
|
|
|
df |
|
|
|
df = df.replace("neutral","neutral") |
|
|
|
sns.countplot(y="sentiment",data=df) |
|
|
|
df.isnull().sum() |
|
|
|
from textblob import TextBlob |
|
|
|
def preprocess(ReviewText):
    # strip HTML tags and escaped entities left over in the raw text
    # (regex=True is required for pattern-based replacement in recent pandas)
    ReviewText = ReviewText.str.replace(r'(<br/>)', '', regex=True)
    ReviewText = ReviewText.str.replace(r'(<a).*(>).*(</a>)', '', regex=True)
    ReviewText = ReviewText.str.replace(r'(&amp)', '', regex=True)
    ReviewText = ReviewText.str.replace(r'(&gt)', '', regex=True)
    ReviewText = ReviewText.str.replace(r'(&lt)', '', regex=True)
    ReviewText = ReviewText.str.replace(r'(\xa0)', ' ', regex=True)
    return ReviewText

df['Review Text'] = preprocess(df['news'])
|
|
|
df['polarity'] = df['news'].map(lambda text: TextBlob(text).sentiment.polarity) |
|
df['news_len'] = df['news'].astype(str).apply(len) |
|
df['word_count'] = df['news'].apply(lambda x: len(str(x).split())) |
|
|
|
df |
|
|
|
print('4 random reviews with the highest positive sentiment polarity (polarity == 1): \n')
|
|
|
df1=df.drop_duplicates(subset=['Review Text']) |
|
|
|
cl = df1.loc[df1.polarity == 1, ['Review Text']].sample(4).values |
|
for c in cl: |
|
print(c[0]) |
|
|
|
print('5 random reviews with neutral sentiment polarity (polarity == 0): \n')
|
cl1 = df.loc[df.polarity == 0, ['Review Text']].sample(5).values |
|
for c in cl1: |
|
print(c[0]) |
|
|
|
print('5 random reviews with the most negative sentiment polarity (polarity <= -0.80): \n')
|
cl3 = df.loc[df.polarity <= -0.80, ['Review Text']].sample(5).values |
|
for c in cl3: |
|
print(c[0]) |
|
|
|
sns.boxplot(df["polarity"],palette="rainbow",data=df) |
|
|
|
df['polarity'].plot( |
|
kind='hist', |
|
bins=50, |
|
color="peru", |
|
title='Sentiment Polarity Distribution');plt.show() |
|
|
|
p_s=df[df["polarity"]>0].count()["sentiment"] |
|
neu_s=df[df["polarity"]==0].count()["sentiment"] |
|
neg_s=df[df["polarity"]<0].count()["sentiment"] |
|
|
|
|
|
sentiment = ['positive_sentiment',"neutral_sentiment","negative_sentiment"] |
|
|
|
|
|
|
|
values = [p_s,neu_s,neg_s] |
|
|
|
|
|
colors = ['#FF0000', 'olive', '#FFFF00'] |
|
|
|
explode = (0.05, 0.05, 0.05) |
|
|
|
|
|
plt.pie(values, colors=colors, labels=sentiment, |
|
autopct='%1.1f%%', pctdistance=0.85, |
|
explode=explode) |
|
|
|
|
|
centre_circle = plt.Circle((0, 0), 0.70, fc='white') |
|
fig = plt.gcf() |
|
|
|
|
|
fig.gca().add_artist(centre_circle) |
|
|
|
|
|
plt.title('count of polarity as per sentiment') |
|
|
|
|
|
plt.show() |
|
|
|
df.plot.box(y=["word_count"],color="hotpink") |
|
|
|
df['word_count'].plot( |
|
kind='hist', |
|
bins=100, |
|
color="orange", |
|
title='Review Text Word Count Distribution');plt.show() |
|
|
|
sns.boxenplot(x="news_len",data=df) |
|
plt.show() |
|
|
|
df['news_len'].plot( |
|
kind='hist', |
|
bins=50, |
|
color="lightblue", |
|
    title='Review Text Length Distribution');plt.show()
|
|
|
fig = px.scatter(df, x="news_len", y="word_count", color="sentiment", |
|
marginal_x="box", marginal_y="violin", |
|
title="Click on the legend items!") |
|
fig.show() |
|
|
|
def get_top_n_words(corpus, n=None): |
|
vec = CountVectorizer().fit(corpus) |
|
bag_of_words = vec.transform(corpus) |
|
sum_words = bag_of_words.sum(axis=0) |
|
words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()] |
|
words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True) |
|
return words_freq[:n] |
|
common_words = get_top_n_words(df['Review Text'], 20) |
|
for word, freq in common_words: |
|
print(word, freq) |
|
df1 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count']) |
|
df1.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot( |
|
kind='bar',title='Top 20 words in review before removing stop words') |
|
df1 |
|
|
|
def get_top_n_words(corpus, n=None): |
|
vec = CountVectorizer(stop_words = 'english').fit(corpus) |
|
bag_of_words = vec.transform(corpus) |
|
sum_words = bag_of_words.sum(axis=0) |
|
words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()] |
|
words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True) |
|
return words_freq[:n] |
|
common_words = get_top_n_words(df['Review Text'], 20) |
|
for word, freq in common_words: |
|
print(word, freq) |
|
df2 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count']) |
|
df2.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(kind='bar', title='Top 20 words in review after removing stop words') |
|
|
|
def get_top_n_bigram(corpus, n=None): |
|
vec = CountVectorizer(ngram_range=(2, 2)).fit(corpus) |
|
bag_of_words = vec.transform(corpus) |
|
sum_words = bag_of_words.sum(axis=0) |
|
words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()] |
|
words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True) |
|
return words_freq[:n] |
|
common_words = get_top_n_bigram(df['Review Text'], 20) |
|
for word, freq in common_words: |
|
print(word, freq) |
|
df3 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count']) |
|
df3.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot( |
|
kind='bar',title='Top 20 bigrams in review before removing stop words') |
|
|
|
def get_top_n_bigram(corpus, n=None): |
|
vec = CountVectorizer(ngram_range=(2, 2), stop_words='english').fit(corpus) |
|
bag_of_words = vec.transform(corpus) |
|
sum_words = bag_of_words.sum(axis=0) |
|
words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()] |
|
words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True) |
|
return words_freq[:n] |
|
common_words = get_top_n_bigram(df['Review Text'], 20) |
|
for word, freq in common_words: |
|
print(word, freq) |
|
df4 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count']) |
|
df4.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot( |
|
kind='bar', title='Top 20 bigrams in review after removing stop words') |
|
|
|
def get_top_n_trigram(corpus, n=None): |
|
vec = CountVectorizer(ngram_range=(3, 3)).fit(corpus) |
|
bag_of_words = vec.transform(corpus) |
|
sum_words = bag_of_words.sum(axis=0) |
|
words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()] |
|
words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True) |
|
return words_freq[:n] |
|
common_words = get_top_n_trigram(df['Review Text'], 20) |
|
for word, freq in common_words: |
|
print(word, freq) |
|
df5 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count']) |
|
df5.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot( |
|
kind='bar', title='Top 20 trigrams in review before removing stop words') |
|
|
|
def get_top_n_trigram(corpus, n=None): |
|
vec = CountVectorizer(ngram_range=(3, 3), stop_words='english').fit(corpus) |
|
bag_of_words = vec.transform(corpus) |
|
sum_words = bag_of_words.sum(axis=0) |
|
words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()] |
|
words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True) |
|
return words_freq[:n] |
|
common_words = get_top_n_trigram(df['Review Text'], 20) |
|
for word, freq in common_words: |
|
print(word, freq) |
|
df6 = pd.DataFrame(common_words, columns = ['ReviewText' ,'count']) |
|
df6.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot( |
|
kind='bar', title='Top 20 trigrams in review after removing stop words') |
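# The six helpers above differ only in ngram_range and stop_words. A minimal consolidated
# sketch (hypothetical name get_top_n_ngrams, same CountVectorizer technique as above)
# that could replace them:

def get_top_n_ngrams(corpus, ngram_range=(1, 1), stop_words=None, n=None):
    """Return the n most frequent n-grams in `corpus` as (term, count) pairs."""
    vec = CountVectorizer(ngram_range=ngram_range, stop_words=stop_words).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

# e.g. top 20 bigrams with stop words removed:
# get_top_n_ngrams(df['Review Text'], ngram_range=(2, 2), stop_words='english', n=20)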
|
|
|
import nltk |
|
nltk.download('punkt') |
|
nltk.download('wordnet') |
|
nltk.download('omw-1.4') |
|
nltk.download('averaged_perceptron_tagger') |
|
|
|
|
|
# join all review texts into one document; TextBlob(str(Series)) would only tag the truncated Series repr
blob = TextBlob(' '.join(df['Review Text'].astype(str)))
|
pos_df = pd.DataFrame(blob.tags, columns = ['word' , 'pos']) |
|
pos_df = pos_df.pos.value_counts()[:20] |
|
pos_df.plot( |
|
kind='bar', |
|
title='Top 20 Part-of-speech tagging for review corpus') |
|
|
|
y0 = df.loc[df['sentiment'] == 'positive']['polarity'] |
|
y1 = df.loc[df['sentiment'] == 'negative']['polarity'] |
|
y2 = df.loc[df['sentiment'] == 'neutral']['polarity'] |
|
|
|
trace0 = go.Box( |
|
y=y0, |
|
name = 'positive', |
|
marker = dict( |
|
color = 'rgb(214, 12, 140)', |
|
) |
|
) |
|
trace1 = go.Box( |
|
y=y1, |
|
name = 'negative', |
|
marker = dict( |
|
color = 'rgb(0, 128, 128)', |
|
) |
|
) |
|
trace2 = go.Box( |
|
y=y2, |
|
name = 'neutral', |
|
marker = dict( |
|
color = 'rgb(10, 140, 208)', |
|
) |
|
) |
|
data = [trace0, trace1, trace2] |
|
layout = go.Layout( |
|
title = "Polarity Boxplot according to sentiment" |
|
) |
|
|
|
go.Figure(data=data,layout=layout) |
|
|
|
y0 = df.loc[df['sentiment'] == 'positive']['news_len'] |
|
y1 = df.loc[df['sentiment'] == 'negative']['news_len'] |
|
y2 = df.loc[df['sentiment'] == 'neutral']['news_len'] |
|
|
|
|
|
trace0 = go.Box( |
|
y=y0, |
|
name = 'positive', |
|
marker = dict( |
|
color = 'rgb(214, 12, 140)', |
|
) |
|
) |
|
trace1 = go.Box( |
|
y=y1, |
|
name = 'negative', |
|
marker = dict( |
|
color = 'rgb(0, 128, 128)', |
|
) |
|
) |
|
trace2 = go.Box( |
|
y=y2, |
|
name = 'neutral', |
|
marker = dict( |
|
color = 'rgb(10, 140, 208)', |
|
) |
|
) |
|
data = [trace0, trace1, trace2] |
|
layout = go.Layout( |
|
title = "news length Boxplot by sentiment" |
|
) |
|
go.Figure(data=data,layout=layout) |
|
|
|
xp = df.loc[df['sentiment'] == "positive", 'polarity'] |
|
xneu = df.loc[df['sentiment'] == "neutral", 'polarity'] |
|
xneg= df.loc[df['sentiment'] == "negative", 'polarity'] |
|
|
|
trace1 = go.Histogram( |
|
x=xp, name='positive', |
|
opacity=0.75 |
|
) |
|
trace2 = go.Histogram( |
|
x=xneu, name = 'neutral', |
|
opacity=0.75 |
|
) |
|
trace3 = go.Histogram( |
|
x=xneg, name = 'negative', |
|
opacity=0.75 |
|
) |
|
data = [trace1, trace2,trace3] |
|
layout = go.Layout(barmode='overlay', title='Distribution of Sentiment polarity') |
|
go.Figure(data=data, layout=layout) |
|
|
|
trace1 = go.Scatter( |
|
x=df['polarity'], y=df['news_len'], mode='markers', name='points', |
|
marker=dict(color='rgb(102,0,0)', size=2, opacity=0.4) |
|
) |
|
trace2 = go.Histogram2dContour( |
|
x=df['polarity'], y=df['news_len'], name='density', ncontours=50, |
|
colorscale='Hot', reversescale=True, showscale=False |
|
) |
|
trace3 = go.Histogram( |
|
x=df['polarity'], name='Sentiment polarity density', |
|
marker=dict(color='rgb(102,0,0)'), |
|
yaxis='y2' |
|
) |
|
trace4 = go.Histogram( |
|
y=df['news_len'], name='news length density', marker=dict(color='rgb(102,0,0)'), |
|
xaxis='x2' |
|
) |
|
data = [trace1, trace2, trace3, trace4] |
|
|
|
layout = go.Layout( |
|
showlegend=False, |
|
autosize=False, |
|
width=600, |
|
height=550, |
|
xaxis=dict( |
|
domain=[0, 0.85], |
|
showgrid=False, |
|
zeroline=False |
|
), |
|
yaxis=dict( |
|
domain=[0, 0.85], |
|
showgrid=False, |
|
zeroline=False |
|
), |
|
margin=dict( |
|
t=50 |
|
), |
|
hovermode='x unified', |
|
bargap=0, |
|
xaxis2=dict( |
|
domain=[0.85, 1], |
|
showgrid=False, |
|
zeroline=False |
|
), |
|
yaxis2=dict( |
|
domain=[0.85, 1], |
|
showgrid=False, |
|
zeroline=False |
|
) |
|
) |
|
|
|
go.Figure(data=data, layout=layout) |
|
|
|
trace1 = go.Scatter( |
|
x=df['polarity'], y=df['word_count'], mode='markers', name='points', |
|
marker=dict(color='rgb(102,0,0)', size=2, opacity=0.4) |
|
) |
|
trace2 = go.Histogram2dContour( |
|
x=df['polarity'], y=df['word_count'], name='density', ncontours=20, |
|
colorscale='Hot', reversescale=True, showscale=False |
|
) |
|
trace3 = go.Histogram( |
|
x=df['polarity'], name='Sentiment polarity density', |
|
marker=dict(color='rgb(102,0,0)'), |
|
yaxis='y2' |
|
) |
|
trace4 = go.Histogram( |
|
y=df['word_count'], name='word count density', marker=dict(color='rgb(112,0,0)'), |
|
xaxis='x2' |
|
) |
|
data = [trace1, trace2, trace3, trace4] |
|
|
|
layout = go.Layout( |
|
showlegend=False, |
|
autosize=False, |
|
width=600, |
|
height=550, |
|
xaxis=dict( |
|
domain=[0, 0.85], |
|
showgrid=False, |
|
zeroline=False |
|
), |
|
yaxis=dict( |
|
domain=[0, 0.85], |
|
showgrid=False, |
|
zeroline=False |
|
), |
|
margin=dict( |
|
t=50 |
|
), |
|
hovermode='closest', |
|
bargap=0, |
|
xaxis2=dict( |
|
domain=[0.85, 1], |
|
showgrid=False, |
|
zeroline=False |
|
), |
|
yaxis2=dict( |
|
domain=[0.85, 1], |
|
showgrid=False, |
|
zeroline=False |
|
) |
|
) |
|
|
|
go.Figure(data=data, layout=layout) |
|
|
|
|
|
|
|
|
|
|
|
import scattertext as st |
|
import spacy |
|
nlp = spacy.blank("en") |
|
nlp.add_pipe('sentencizer') |
|
|
|
corpus = st.CorpusFromPandas(df, category_col='sentiment', text_col='Review Text', nlp=nlp).build() |
|
print(list(corpus.get_scaled_f_scores_vs_background().index[:20])) |
|
|
|
term_freq_df = corpus.get_term_freq_df() |
|
term_freq_df['positive_sentiment'] = corpus.get_scaled_f_scores('positive') |
|
list(term_freq_df.sort_values(by='positive_sentiment', ascending=False).index[:20]) |
|
|
|
term_freq_df['neutral_sentiment'] = corpus.get_scaled_f_scores('neutral') |
|
list(term_freq_df.sort_values(by='neutral_sentiment', ascending=False).index[:20]) |
|
|
|
term_freq_df['negative_sentiment'] = corpus.get_scaled_f_scores('negative') |
|
list(term_freq_df.sort_values(by='negative_sentiment', ascending=False).index[:20]) |
|
|
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from sklearn.decomposition import TruncatedSVD |
|
from collections import Counter |
|
|
|
tfidf_vectorizer = TfidfVectorizer(stop_words='english', use_idf=True, smooth_idf=True) |
|
reindexed_data = df['Review Text'].values |
|
document_term_matrix = tfidf_vectorizer.fit_transform(reindexed_data) |
|
n_topics = 10 |
|
lsa_model = TruncatedSVD(n_components=n_topics) |
|
lsa_topic_matrix = lsa_model.fit_transform(document_term_matrix) |
|
|
|
def get_keys(topic_matrix): |
|
''' |
|
returns an integer list of predicted topic |
|
categories for a given topic matrix |
|
''' |
|
keys = topic_matrix.argmax(axis=1).tolist() |
|
return keys |
|
|
|
def keys_to_counts(keys): |
|
''' |
|
returns a tuple of topic categories and their |
|
accompanying magnitudes for a given list of keys |
|
''' |
|
count_pairs = Counter(keys).items() |
|
categories = [pair[0] for pair in count_pairs] |
|
counts = [pair[1] for pair in count_pairs] |
|
return (categories, counts) |
|
|
|
lsa_keys = get_keys(lsa_topic_matrix) |
|
lsa_categories, lsa_counts = keys_to_counts(lsa_keys) |
|
|
|
def get_top_n_words(n, keys, document_term_matrix, tfidf_vectorizer): |
|
''' |
|
returns a list of n_topic strings, where each string contains the n most common |
|
words in a predicted category, in order |
|
''' |
|
top_word_indices = [] |
|
for topic in range(n_topics): |
|
temp_vector_sum = 0 |
|
for i in range(len(keys)): |
|
if keys[i] == topic: |
|
temp_vector_sum += document_term_matrix[i] |
|
temp_vector_sum = temp_vector_sum.toarray() |
|
top_n_word_indices = np.flip(np.argsort(temp_vector_sum)[0][-n:],0) |
|
top_word_indices.append(top_n_word_indices) |
|
top_words = [] |
|
for topic in top_word_indices: |
|
topic_words = [] |
|
for index in topic: |
|
temp_word_vector = np.zeros((1,document_term_matrix.shape[1])) |
|
temp_word_vector[:,index] = 1 |
|
the_word = tfidf_vectorizer.inverse_transform(temp_word_vector)[0][0] |
|
topic_words.append(the_word.encode('ascii').decode('utf-8')) |
|
top_words.append(" ".join(topic_words)) |
|
return top_words |
|
|
|
top_lsa=get_top_n_words(3, lsa_keys, document_term_matrix, tfidf_vectorizer) |
|
|
|
for i in range(len(top_lsa)): |
|
print("Topic {}: ".format(i+1), top_lsa[i]) |
|
|
|
top_3_words = get_top_n_words(3, lsa_keys, document_term_matrix, tfidf_vectorizer) |
|
labels = ['Topic {}: \n'.format(i+1) + top_3_words[i] for i in lsa_categories] |
|
fig, ax = plt.subplots(figsize=(16,8)) |
|
ax.bar(lsa_categories, lsa_counts,color="skyblue"); |
|
ax.set_xticks(lsa_categories,); |
|
ax.set_xticklabels(labels, rotation=45, rotation_mode='default',color="olive"); |
|
ax.set_ylabel('Number of review text on topics'); |
|
ax.set_title('Count of LSA topics'); |
|
plt.show(); |
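# A shorter cross-check of the LSA topics, reading the top-weighted terms straight from the
# fitted TruncatedSVD components. This is a sketch; get_feature_names_out assumes
# scikit-learn >= 1.0 is installed.

terms = tfidf_vectorizer.get_feature_names_out()
for topic_idx, component in enumerate(lsa_model.components_):
    top_terms = [terms[i] for i in np.argsort(component)[::-1][:3]]
    print("Topic {}: {}".format(topic_idx + 1, " ".join(top_terms)))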
|
|
|
"""#---2----""" |
|
|
|
df['sentiment'].value_counts() |
|
|
|
from sklearn.model_selection import train_test_split |
|
train, eva = train_test_split(df, test_size=0.2)
|
|
|
|
|
|
|
from simpletransformers.classification import ClassificationModel |
|
|
|
|
|
model = ClassificationModel('bert', 'bert-base-cased', num_labels=3, args={'reprocess_input_data': True, 'overwrite_output_dir': True},use_cuda=False) |
|
|
|
|
|
# label mapping used for training and evaluation: positive -> 0, negative -> 1, neutral -> 2
def making_label(st):
|
if(st=='positive'): |
|
return 0 |
|
elif(st=='neutral'): |
|
return 2 |
|
else: |
|
return 1 |
|
|
|
train['label'] = train['sentiment'].apply(making_label) |
|
eva['label'] = eva['sentiment'].apply(making_label) |
|
print(train.shape) |
|
|
|
train_df = pd.DataFrame({ |
|
'text': train['news'][:1500].replace(r'\n', ' ', regex=True), |
|
'label': train['label'][:1500] |
|
}) |
|
|
|
eval_df = pd.DataFrame({ |
|
'text': eva['news'][-400:].replace(r'\n', ' ', regex=True), |
|
'label': eva['label'][-400:] |
|
}) |
|
|
|
model.train_model(train_df) |
|
|
|
result, model_outputs, wrong_predictions = model.eval_model(eval_df) |
|
|
|
result |
|
|
|
model_outputs |
|
|
|
len(wrong_predictions) |
|
|
|
lst = [] |
|
for arr in model_outputs: |
|
lst.append(np.argmax(arr)) |
|
|
|
true = eval_df['label'].tolist() |
|
predicted = lst |
|
|
|
import sklearn.metrics  # importing the top-level package alone does not reliably expose sklearn.metrics

mat = sklearn.metrics.confusion_matrix(true, predicted)
|
mat |
|
|
|
df_cm = pd.DataFrame(mat, range(3), range(3)) |
|
|
|
sns.heatmap(df_cm, annot=True) |
|
plt.show() |
|
|
|
# target_names must follow the numeric label order from making_label: 0=positive, 1=negative, 2=neutral
print(sklearn.metrics.classification_report(true, predicted, target_names=['positive', 'negative', 'neutral']))
|
|
|
sklearn.metrics.accuracy_score(true,predicted) |
|
|
|
|
|
def get_result(statement): |
|
result = model.predict([statement]) |
|
pos = np.where(result[1][0] == np.amax(result[1][0])) |
|
pos = int(pos[0]) |
|
sentiment_dict = {0:'positive',1:'negative',2:'neutral'} |
|
print(sentiment_dict[pos]) |
|
return |
|
|
|
|
|
get_result("According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .") |
|
|
|
|
|
get_result("According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales .") |
|
|
|
|
|
get_result('Sales in Finland decreased by 2.0 % , and international sales decreased by 9.3 % in terms of euros , and by 15.1 % in terms of local currencies .') |
|
|
|
get_result("This company is growing like anything with 23% profit every year") |
|
|
|
get_result("This company is not able to make any profit but make very less profit in last quarter") |
|
|
|
get_result("The doctor treated well and the patient was very healthy") |
|
|
|
get_result("the act of politicians is to serve and help needy and not to create ruck suck") |
|
|
|
get_result("American burger is too good. Can't resisit to go and have one") |
|
|
|
get_result("GDP per capita increased to double in India from 2013") |
|
|
|
get_result("Indian economy is doing very good and will become super power one day.") |
|
|
|
get_result("Indian economy is doing very good and will create millions of jobs in coming years") |
|
|
|
get_result("Indian economy is not doing very good and need urgent reforms but we are pretty sure it will be very good in coming years") |
|
|
|
get_result("Indian economy is doing very good.Indian economy is not doing very good ") |
|
|
|
get_result("Indian economy is not doing very good. Indian economy will bounce back to become leading economy") |
|
|
|
get_result("Indian economy is not doing very good. Urgent reforms is required to create new jobs and improve export") |
|
|
|
get_result("The stock market of Indian economy is dangling too much") |
|
|
|
"""#VADER""" |
|
|
|
|
|
|
|
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer |
|
|
|
obj = SentimentIntensityAnalyzer() |
|
|
|
sentence = "Ram is really good " |
|
sentiment_dict = obj.polarity_scores(sentence) |
|
print(sentiment_dict) |
|
|
|
|
|
sentence = "Ram is better " |
|
sentiment_dict = obj.polarity_scores(sentence) |
|
print(sentiment_dict) |
|
|
|
sentence = "Rahul is really bad" |
|
sentiment_dict = obj.polarity_scores(sentence) |
|
print(sentiment_dict) |
|
|
|
|
|
print(obj.polarity_scores('Ram is good boy')) |
|
print(obj.polarity_scores('Ram is good boy!')) |
|
print(obj.polarity_scores('Ram is good boy!!')) |
|
|
|
|
|
print(obj.polarity_scores('Ram is good')) |
|
print(obj.polarity_scores('Ram is GOOD')) |
|
|
|
|
|
print(obj.polarity_scores('Ram is good')) |
|
print(obj.polarity_scores('Ram is better')) |
|
print(obj.polarity_scores('Ram is best')) |
|
|
|
print(obj.polarity_scores('Ram is bad')) |
|
print(obj.polarity_scores('Ram is worse')) |
|
print(obj.polarity_scores('Ram is worst')) |
|
|
|
|
|
print(obj.polarity_scores('Ram is good')) |
|
print(obj.polarity_scores('Ram is good, but he is also naughty sometimes')) |
|
|
|
|
|
print(obj.polarity_scores("That Hotel")) |
|
print(obj.polarity_scores("That Hotel SUX")) |
|
print(obj.polarity_scores("That Hotel SUCKS")) |
|
|
|
|
|
print(obj.polarity_scores("Your :) is the most beautiful thing I have ever seen")) |
|
print(obj.polarity_scores("Your smile is the most beautiful thing I have ever seen")) |
|
|
|
print(obj.polarity_scores("Your :( is the worst thing I have ever seen")) |
|
print(obj.polarity_scores("Your smile is the worst thing I have ever seen")) |
|
|
|
|
|
|
|
|
|
"""#3.a Using FINBERT Model""" |
|
|
|
|
|
|
|
|
|
from transformers import BertTokenizer, BertForSequenceClassification, pipeline |
|
|
|
|
|
import transformers |
|
transformers.__version__ |
|
|
|
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3) |
|
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone') |
|
|
|
nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer) |
|
results = nlp(['growth is strong and we have plenty of liquidity.', |
|
'there is a shortage of capital, and we need extra financing.', |
|
'formulation patents might protect Vasotec to a limited extent.']) |
|
|
|
results |
|
|
|
"""#FINBERT ESG""" |
|
|
|
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-esg',num_labels=4) |
|
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-esg') |
|
|
|
nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer) |
|
results = nlp(['Managing and working to mitigate the impact our operations have on the environment is a core element of our business.', |
|
'Rhonda has been volunteering for several years for a variety of charitable community programs.', |
|
'Cabot\'s annual statements are audited annually by an independent registered public accounting firm.', |
|
'As of December 31, 2012, the 2011 Term Loan had a principal balance of $492.5 million.']) |
|
|
|
results |
|
|
|
"""#FINBERT Classification""" |
|
|
|
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-fls',num_labels=3) |
|
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-fls') |
|
|
|
nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer) |
|
results = nlp(['we expect the age of our fleet to enhance availability and reliability due to reduced downtime for repairs.', |
|
'on an equivalent unit of production basis, general and administrative expenses declined 24 percent from 1994 to $.67 per boe.', |
|
'we will continue to assess the need for a valuation allowance against deferred tax assets considering all available evidence obtained in']) |
|
|
|
results |
|
|
|
X = df['Review Text'].to_list() |
|
y = df['sentiment'].to_list() |
|
|
|
from transformers import BertTokenizer, BertForSequenceClassification |
|
|
|
finbert_whole = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3) |
|
tokenizer_whole = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone') |
|
|
|
labels = {0:'neutral', 1:'positive',2:'negative'} |
|
|
|
sent_val = list() |
|
for x in X: |
|
inputs = tokenizer_whole(x, return_tensors="pt", padding=True) |
|
outputs = finbert_whole(**inputs)[0] |
|
|
|
val = labels[np.argmax(outputs.detach().numpy())] |
|
print(x, '---->', val) |
|
print('#######################################################') |
|
sent_val.append(val) |
|
|
|
from sklearn.metrics import accuracy_score |
|
print(accuracy_score(y, sent_val)) |
|
|
|
"""#Using DISTILBERT""" |
|
|
|
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification |
|
|
|
tokenizer_distilbert = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
# Note: this checkpoint has no fine-tuned sentiment head; the 3-way classification layer
# is randomly initialized, so the predictions below serve only as a rough baseline.
model_distilbert = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)
|
|
|
labels = {0:'neutral', 1:'positive',2:'negative'} |
|
|
|
sent_val_bert = list() |
|
for x in X: |
|
inputs = tokenizer_distilbert(x, return_tensors="pt", padding=True) |
|
outputs = model_distilbert(**inputs)[0] |
|
|
|
val = labels[np.argmax(outputs.detach().numpy())] |
|
print(x, '---->', val) |
|
print('#######################################################') |
|
sent_val_bert.append(val) |
|
|
|
from sklearn.metrics import accuracy_score |
|
print(accuracy_score(y, sent_val_bert))
|
|
|
"""#Bert""" |
|
|
|
# use the BERT classes (imported earlier) for the BERT checkpoint; as with DistilBERT,
# the classification head is randomly initialized rather than fine-tuned for sentiment.
tokenizer_bert = BertTokenizer.from_pretrained("bert-base-uncased")
model_bert = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
|
|
|
labels = {0:'neutral', 1:'positive',2:'negative'} |
|
|
|
sent_val_bert1 = list() |
|
for x in X: |
|
inputs = tokenizer_bert(x, return_tensors="pt", padding=True) |
|
outputs = model_bert(**inputs)[0] |
|
|
|
val = labels[np.argmax(outputs.detach().numpy())] |
|
print(x, '---->', val) |
|
print('#######################################################') |
|
sent_val_bert1.append(val) |
|
|
|
from sklearn.metrics import accuracy_score |
|
print(accuracy_score(y, sent_val_bert1))
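# A small wrap-up sketch (not in the original notebook) collecting the accuracies of the
# three Hugging Face runs above into one table for easier comparison; 'summary' is a
# hypothetical name.

summary = pd.DataFrame({
    'model': ['finbert-tone', 'distilbert-base-uncased', 'bert-base-uncased'],
    'accuracy': [accuracy_score(y, sent_val),
                 accuracy_score(y, sent_val_bert),
                 accuracy_score(y, sent_val_bert1)],
})
print(summary)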