Spaces:
Sleeping
Sleeping
File size: 7,362 Bytes
ee275ef |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 |
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.lib.units import inch
from io import BytesIO
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import chardet
import os
# Load model and tokenizer
MODEL_NAME = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"


@st.cache_resource
def _load_sentiment_model():
    """Load the tokenizer and classification model for ``MODEL_NAME``.

    Streamlit re-executes this script from the top on every widget
    interaction; caching the loader as a resource avoids re-instantiating
    the tokenizer and model on each rerun.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    return (
        AutoTokenizer.from_pretrained(MODEL_NAME),
        AutoModelForSequenceClassification.from_pretrained(MODEL_NAME),
    )


# Module-level names used by analyze_sentiment().
tokenizer, model = _load_sentiment_model()
def analyze_sentiment(text):
    """Classify ``text`` using the module-level ``tokenizer`` and ``model``.

    The model logits are converted to softmax scores along dimension 1 and
    the highest-scoring class is reported.

    Returns:
        A dict with ``"label"`` (``'NEGATIVE'`` for class index 0,
        ``'POSITIVE'`` for class index 1) and ``"score"`` (the winning
        softmax score as a Python float).
    """
    class_names = ['NEGATIVE', 'POSITIVE']
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Inference only, so gradient tracking is disabled for the forward pass.
    with torch.no_grad():
        logits = model(**encoded).logits
    probabilities = torch.softmax(logits, dim=1)
    best = torch.max(probabilities, dim=1)
    return {
        "label": class_names[best.indices.item()],
        "score": best.values.item(),
    }
def detect_encoding(file):
    """Guess the character encoding of a binary file-like object.

    The whole stream is read and handed to ``chardet.detect``. When the
    stream is seekable it is rewound before reading (so a previously
    consumed upload is still inspected in full) and again afterwards (so the
    caller can read the same object without getting empty data).

    Args:
        file: Binary file-like object exposing ``read()``.

    Returns:
        The value of the ``'encoding'`` entry in chardet's result; per
        chardet's documentation this can be ``None`` when no encoding could
        be determined.
    """
    seekable = getattr(file, "seekable", None)
    can_rewind = callable(seekable) and seekable()
    if can_rewind:
        file.seek(0)
    rawdata = file.read()
    if can_rewind:
        # Leave the stream ready for the next reader.
        file.seek(0)
    return chardet.detect(rawdata)['encoding']
def generate_pdf(pie_chart_path, pos_wordcloud_path, neg_wordcloud_path):
    """Lay out the sentiment report on a single tall page and return it.

    The page is 8.27 in wide and 16.5 in tall. From top to bottom it holds
    the report title, the pie chart (5 in x 4 in), and two headed word-cloud
    images (6 in x 3 in each): positive keywords, then negative keywords.

    Args:
        pie_chart_path: Path of the pie chart image.
        pos_wordcloud_path: Path of the positive word cloud image.
        neg_wordcloud_path: Path of the negative word cloud image.

    Returns:
        A ``BytesIO`` holding the finished PDF, positioned at offset 0.
    """
    page_width = 8.27 * inch
    page_height = 16.5 * inch
    buffer = BytesIO()
    report = canvas.Canvas(buffer, pagesize=(page_width, page_height))

    # Title sits one inch below the top edge.
    cursor_y = page_height - 1 * inch
    report.setFont("Helvetica-Bold", 20)
    report.drawString(2.2 * inch, cursor_y, "Sentiment Analysis Report")
    cursor_y -= 2 * inch

    # drawImage anchors at the lower-left corner, hence the height offset.
    chart_width, chart_height = 5 * inch, 4 * inch
    report.drawImage(
        pie_chart_path,
        1.5 * inch,
        cursor_y - chart_height,
        width=chart_width,
        height=chart_height,
    )
    cursor_y -= (chart_height + 1 * inch)

    # Both keyword sections share one layout: heading, then a 2:1 image.
    sections = (
        ("Positive Keywords", pos_wordcloud_path),
        ("Negative Keywords", neg_wordcloud_path),
    )
    for heading, image_path in sections:
        report.setFont("Helvetica-Bold", 12)
        report.drawString(3 * inch, cursor_y, heading)
        report.drawImage(
            image_path,
            1 * inch,
            cursor_y - 3.3 * inch,
            width=6 * inch,
            height=3 * inch,
        )
        cursor_y -= (3 * inch + 1 * inch)

    report.save()
    buffer.seek(0)
    return buffer
# Streamlit UI
st.title("Sentiment Analysis and Reporting")

# Image files written to the working directory while building a report.
REPORT_IMAGE_PATHS = ("sentiment_pie_chart.png", "pos_wordcloud.png", "neg_wordcloud.png")


def _save_wordcloud(text, path, stopwords):
    """Write a word cloud for ``text`` to ``path``.

    ``WordCloud.generate`` raises ``ValueError`` when the text contains no
    usable words (for example when no review received this sentiment). In
    that case a placeholder image is written instead so the report can still
    be assembled.
    """
    try:
        cloud = WordCloud(max_font_size=80, max_words=10, background_color='white', stopwords=stopwords).generate(text)
    except ValueError:
        placeholder, axes = plt.subplots(figsize=(4, 2))
        axes.text(0.5, 0.5, "No keywords available", ha="center", va="center")
        axes.axis("off")
        placeholder.savefig(path)
        plt.close(placeholder)
    else:
        cloud.to_file(path)


# Initialize session state. The PDF bytes are kept here so the download
# button can still be rendered on reruns where "Go" was not clicked.
if 'show_pdf_download' not in st.session_state:
    st.session_state.show_pdf_download = False
if 'pdf_report' not in st.session_state:
    st.session_state.pdf_report = None

# Sidebar for encoding detection and reset button
st.sidebar.header("File Encoding Checker")

# File uploader in the sidebar
encoding_check_file = st.sidebar.file_uploader("Upload CSV file for Encoding Check", type=["csv"])
if encoding_check_file:
    # Rewind first: the upload may already have been read on an earlier rerun.
    encoding_check_file.seek(0)
    encoding = detect_encoding(encoding_check_file)
    st.sidebar.write(f"Detected encoding: {encoding}")

# Reset button in the sidebar
if st.sidebar.button("Reset Analysis"):
    for image_path in REPORT_IMAGE_PATHS:
        if os.path.exists(image_path):
            os.remove(image_path)
    st.session_state.show_pdf_download = False
    st.session_state.pdf_report = None
    st.sidebar.write("Files deleted. Please re-upload a file to start over.")

# File uploader for sentiment analysis
uploaded_file = st.file_uploader("Upload CSV file for Sentiment Analysis", type=["csv"])

# Dropdown for encoding specification in the main panel
encodings = ['utf-8', 'latin-1', 'ISO-8859-1', 'ASCII', 'UTF-16', 'UTF-32', 'ANSI', "Windows-1251", 'Windows-1252']
user_encoding = st.selectbox("Select Encoding", options=encodings, index=0)

# Button to start processing
if st.button("Go"):
    # A new run invalidates any report left over from a previous one.
    st.session_state.show_pdf_download = False
    st.session_state.pdf_report = None

    if uploaded_file:
        try:
            # Load the CSV file into DataFrame with specified encoding
            uploaded_file.seek(0)  # Reset the file pointer to the beginning
            df = pd.read_csv(uploaded_file, encoding=user_encoding)
        except (UnicodeError, LookupError):
            # LookupError covers names the running platform has no codec for
            # (e.g. 'ANSI' outside Windows).
            st.error("Error decoding the file. Please specify the correct encoding.")
        except pd.errors.EmptyDataError:
            st.warning("The uploaded CSV file is empty.")
        else:
            # Check if the DataFrame has exactly one column
            if df.shape[1] != 1:
                st.warning("The CSV file should only contain one column with review data.")
            else:
                # Rename the column to 'review'
                df.columns = ['review']

                # Clean up the DataFrame; .copy() so the new columns below are
                # added to an independent frame rather than a filtered view.
                df['review'] = df['review'].astype(str).str.strip()
                df = df[df['review'].str.len() <= 512].copy()

                if df.empty:
                    st.warning("No reviews of 512 characters or fewer were found in the file.")
                else:
                    # Apply sentiment analysis
                    sentiments = df['review'].apply(analyze_sentiment)
                    df['sentiment_label'] = sentiments.apply(lambda x: x['label'])
                    df['sentiment_score'] = sentiments.apply(lambda x: x['score'])

                    # Pie chart data
                    sentiment_counts = df['sentiment_label'].value_counts()

                    # Create pie chart
                    fig, ax = plt.subplots()
                    ax.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=45)
                    ax.set_title('Distribution of Sentiment')
                    pie_chart_path = "sentiment_pie_chart.png"
                    fig.savefig(pie_chart_path)
                    # Close the figure so figures do not accumulate across reruns.
                    plt.close(fig)

                    # Create word clouds
                    stopwords = set(STOPWORDS)
                    pos_reviews = ' '.join(df.loc[df['sentiment_label'] == 'POSITIVE', 'review'])
                    neg_reviews = ' '.join(df.loc[df['sentiment_label'] == 'NEGATIVE', 'review'])

                    # Save word clouds to files
                    pos_wordcloud_path = "pos_wordcloud.png"
                    neg_wordcloud_path = "neg_wordcloud.png"
                    _save_wordcloud(pos_reviews, pos_wordcloud_path, stopwords)
                    _save_wordcloud(neg_reviews, neg_wordcloud_path, stopwords)

                    # Create PDF
                    pdf_output = generate_pdf(pie_chart_path, pos_wordcloud_path, neg_wordcloud_path)

                    # Display options
                    st.write("Processing complete!")

                    # Keep the report in session state so the download button
                    # survives the rerun triggered by clicking it.
                    st.session_state.pdf_report = pdf_output.getvalue()
                    st.session_state.show_pdf_download = True
    else:
        st.info("Please upload a CSV file to get started.")

# Display buttons
if st.session_state.show_pdf_download and st.session_state.pdf_report is not None:
    st.download_button(
        "Download PDF Report",
        st.session_state.pdf_report,
        file_name="sentiment_analysis_report.pdf",
        mime="application/pdf",
    )
|