import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as pe
from wordcloud import WordCloud, STOPWORDS
from nltk import FreqDist



st.title("Welcome to the Explore Page")

st.markdown("On this page you can explore some EDA visuals.")

## Loading the raw and cleaned datasets
data = pd.read_csv("datasets/Train.csv")

clean_data = pd.read_csv("datasets/clean_copy.csv")

## Dropping rows with missing values from the cleaned data
clean_data = clean_data.dropna()

## Plotting a word cloud of the most common words in the unclean dataset
unclean_words = " ".join(data["safe_text"].astype(str))

wc = WordCloud(stopwords=STOPWORDS).generate(unclean_words)

## Passing the figure to st.pyplot explicitly (global pyplot usage is deprecated)
fig_wc, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wc, interpolation="bilinear")
ax.axis("off")
ax.set_title("Most Common Words in the Unclean Dataset")
st.pyplot(fig_wc)

## Treemap of the most frequent words in the cleaned tweets
clean_words = " ".join(clean_data["clean_tweet"]).split()  ## flattening the cleaned tweets into a corpus of words

freq_words = pd.DataFrame(FreqDist(clean_words).most_common(20), columns=["word", "count"])

fig = pe.treemap(data_frame=freq_words, path=["word"], values="count", title="Top 20 Most Frequent Words After Cleaning")

st.plotly_chart(fig)


## Getting the tweet lengths (number of words per tweet)
data["tweet_length"] = data["safe_text"].astype(str).str.split().str.len()

## Counting how many tweets have each length; rename_axis/reset_index keeps the column names stable across pandas versions
words = data["tweet_length"].value_counts().rename_axis("tweet_length").reset_index(name="count")

fig_2 = pe.scatter(data_frame=words, x="tweet_length", y="count", size="count", color="tweet_length", title="Tweet Lengths")

st.plotly_chart(fig_2)