Spaces:
Runtime error
Runtime error
Upload app.py
Browse files
app.py
CHANGED
@@ -6,12 +6,11 @@ from fuzzywuzzy import fuzz
|
|
6 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
7 |
import torch.nn.functional as F
|
8 |
import torch
|
9 |
-
import os
|
10 |
import io
|
11 |
import base64
|
12 |
from stqdm import stqdm
|
13 |
import nltk
|
14 |
-
|
15 |
from nltk.corpus import stopwords
|
16 |
nltk.download('stopwords')
|
17 |
import matplotlib.pyplot as plt
|
@@ -31,19 +30,12 @@ classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnl
|
|
31 |
|
32 |
|
33 |
#defs
|
34 |
-
def classify_reviews(reviews
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
batch_reviews = reviews[i:i+batch_size]
|
39 |
-
inputs = tokenizer(batch_reviews, return_tensors='pt', truncation=True, padding=True, max_length=512)
|
40 |
-
outputs = model(**inputs)
|
41 |
-
batch_probabilities = F.softmax(outputs.logits, dim=1).tolist()
|
42 |
-
probabilities.extend(batch_probabilities)
|
43 |
-
|
44 |
return probabilities
|
45 |
|
46 |
-
|
47 |
def top_rating(scores):
    """Return the 1-based position of the largest value in ``scores``.

    When the maximum occurs more than once, the position of its first
    occurrence is returned. ``max`` raises ``ValueError`` for an empty
    sequence.
    """
    best_score = max(scores)
    zero_based_position = scores.index(best_score)
    # Ratings are counted from 1, list positions from 0.
    return zero_based_position + 1
|
49 |
|
@@ -70,17 +62,15 @@ def process_filter_words(filter_words_input):
|
|
70 |
|
71 |
|
72 |
# Function for classifying with the new model
|
73 |
-
def classify_with_new_classes(reviews, class_names
|
74 |
class_scores = []
|
75 |
|
76 |
-
for
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
scores = [scores_dict[name] for name in class_names]
|
83 |
-
class_scores.append(scores)
|
84 |
|
85 |
return class_scores
|
86 |
|
@@ -90,43 +80,36 @@ def main():
|
|
90 |
st.title('Sentiment Analysis')
|
91 |
st.markdown('Upload an Excel file to get sentiment analytics')
|
92 |
|
93 |
-
|
94 |
review_column = None
|
95 |
df = None
|
96 |
class_names = None # New variable for class names
|
97 |
|
98 |
-
if
|
99 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
|
100 |
-
tmp.write(uploaded_file.getvalue())
|
101 |
-
tmp_file_name = tmp.name
|
102 |
-
|
103 |
try:
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
filter_words_input = st.text_input('Enter words to filter the data by, separated by comma (or leave empty)', key='filter words') # New input field for filter words
|
115 |
-
filter_words = [] if filter_words_input.strip() == "" else process_filter_words(filter_words_input) # Process the filter words
|
116 |
-
class_names = st.text_input('Enter the possible class names separated by comma', key='class names') # New input field for class names
|
117 |
-
chunk = filter_dataframe(chunk, review_column, filter_words) # Filter the DataFrame
|
118 |
-
|
119 |
-
chunks.append(chunk)
|
120 |
|
121 |
-
|
|
|
|
|
|
|
122 |
except Exception as e:
|
123 |
st.write("An error occurred while reading the uploaded file. Please make sure it's a valid Excel file.")
|
124 |
return
|
125 |
|
126 |
-
start_button = st.button('Start Analysis'
|
127 |
|
128 |
|
129 |
if start_button and df is not None:
|
|
|
130 |
df = df[df[review_column].notna()]
|
131 |
df = df[df[review_column].str.strip() != '']
|
132 |
|
@@ -150,27 +133,24 @@ def main():
|
|
150 |
|
151 |
|
152 |
|
153 |
-
def process_reviews(df, review_column, class_names
|
154 |
with st.spinner('Classifying reviews...'):
|
155 |
progress_bar = st.progress(0)
|
156 |
total_reviews = len(df[review_column].tolist())
|
157 |
review_counter = 0
|
158 |
|
|
|
159 |
raw_scores = []
|
160 |
reviews = df[review_column].tolist()
|
161 |
for i in range(0, len(reviews), batch_size):
|
162 |
batch_reviews = reviews[i:i+batch_size]
|
163 |
-
batch_scores = classify_reviews(batch_reviews
|
164 |
raw_scores.extend(batch_scores)
|
165 |
review_counter += len(batch_reviews)
|
166 |
progress_bar.progress(review_counter / total_reviews)
|
167 |
|
168 |
with st.spinner('Generating classes...'):
|
169 |
-
class_scores = []
|
170 |
-
for i in range(0, len(reviews), batch_size):
|
171 |
-
batch_reviews = reviews[i:i+batch_size]
|
172 |
-
batch_scores = classify_with_new_classes(batch_reviews, class_names, batch_size)
|
173 |
-
class_scores.extend(batch_scores)
|
174 |
|
175 |
class_scores_dict = {} # New dictionary to store class scores
|
176 |
for i, name in enumerate(class_names):
|
@@ -181,6 +161,7 @@ def process_reviews(df, review_column, class_names, batch_size=64):
|
|
181 |
if class_names and not all(name.isspace() for name in class_names):
|
182 |
df['Highest Class'] = df[class_names].idxmax(axis=1)
|
183 |
|
|
|
184 |
df_new = df.copy()
|
185 |
df_new['raw_scores'] = raw_scores
|
186 |
scores_to_df(df_new)
|
@@ -200,7 +181,6 @@ def process_reviews(df, review_column, class_names, batch_size=64):
|
|
200 |
|
201 |
|
202 |
|
203 |
-
|
204 |
def scores_to_df(df):
|
205 |
for i in range(1, 6):
|
206 |
df[f'{i} Star'] = df['raw_scores'].apply(lambda scores: scores[i-1]).round(2)
|
@@ -269,6 +249,8 @@ def display_ratings(df, review_column):
|
|
269 |
cols[i-1].markdown("No important words to display")
|
270 |
|
271 |
|
|
|
|
|
272 |
|
273 |
if __name__ == "__main__":
|
274 |
main()
|
|
|
6 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
7 |
import torch.nn.functional as F
|
8 |
import torch
|
|
|
9 |
import io
|
10 |
import base64
|
11 |
from stqdm import stqdm
|
12 |
import nltk
|
13 |
+
|
14 |
from nltk.corpus import stopwords
|
15 |
nltk.download('stopwords')
|
16 |
import matplotlib.pyplot as plt
|
|
|
30 |
|
31 |
|
32 |
#defs
|
33 |
+
def classify_reviews(reviews):
    """Return class probabilities for a batch of review texts.

    The texts are tokenized with the module-level ``tokenizer`` (padded,
    and truncated to at most 512 tokens), passed through the module-level
    ``model``, and the resulting logits are soft-maxed along dimension 1.

    Args:
        reviews: List of review strings forming one batch.

    Returns:
        A list with one entry per review; each entry is a list of float
        probabilities, one per model output class. An empty batch yields
        an empty list.
        NOTE(review): other code in this file reads five entries per row
        ("1 Star" .. "5 Star") -- presumably the model has 5 labels; confirm.
    """
    # Nothing to classify: skip the tokenizer/model call entirely.
    if len(reviews) == 0:
        return []

    inputs = tokenizer(reviews, return_tensors='pt', truncation=True, padding=True, max_length=512)
    # Inference only: with autograd disabled no computation graph is kept,
    # which lowers memory use per batch without changing the outputs.
    with torch.no_grad():
        outputs = model(**inputs)
    probabilities = F.softmax(outputs.logits, dim=1).tolist()
    return probabilities
|
38 |
|
|
|
39 |
def top_rating(scores):
    """Return the 1-based position of the largest value in ``scores``.

    When the maximum occurs more than once, the position of its first
    occurrence is returned. ``max`` raises ``ValueError`` for an empty
    sequence.
    """
    best_score = max(scores)
    zero_based_position = scores.index(best_score)
    # Ratings are counted from 1, list positions from 0.
    return zero_based_position + 1
|
41 |
|
|
|
62 |
|
63 |
|
64 |
# Function for classifying with the new model
|
65 |
+
def classify_with_new_classes(reviews, class_names):
    """Score every review against the user-supplied class names.

    Each review is passed, together with ``class_names``, to the
    module-level zero-shot ``classifier``. The classifier's result is read
    through its ``'labels'`` and ``'scores'`` entries, and the scores are
    re-ordered to follow the order of ``class_names``.

    Args:
        reviews: Iterable of review strings.
        class_names: Sequence of candidate class labels. May be empty or
            ``None``.

    Returns:
        A list with one entry per review; each entry is a list of scores
        aligned position-by-position with ``class_names``. When
        ``class_names`` is empty or ``None`` every entry is an empty list
        and the classifier is not called.
    """
    # Without labels there is nothing to score; the zero-shot pipeline is
    # expected to reject an empty label list, so keep one (empty) row per
    # review instead of calling it.
    if not class_names:
        return [[] for _ in reviews]

    class_scores = []
    for review in reviews:
        result = classifier(review, class_names)
        scores_dict = dict(zip(result['labels'], result['scores']))
        # Reorder scores to match the original class_names order
        class_scores.append([scores_dict[name] for name in class_names])

    return class_scores
|
76 |
|
|
|
80 |
st.title('Sentiment Analysis')
|
81 |
st.markdown('Upload an Excel file to get sentiment analytics')
|
82 |
|
83 |
+
file = st.file_uploader("Upload an excel file", type=['xlsx'])
|
84 |
review_column = None
|
85 |
df = None
|
86 |
class_names = None # New variable for class names
|
87 |
|
88 |
+
if file is not None:
|
|
|
|
|
|
|
|
|
89 |
try:
|
90 |
+
df = pd.read_excel(file)
|
91 |
+
# Drop rows where all columns are NaN
|
92 |
+
df = df.dropna(how='all')
|
93 |
+
# Replace blank spaces with NaN, then drop rows where all columns are NaN again
|
94 |
+
df = df.replace(r'^\s*$', np.nan, regex=True)
|
95 |
+
df = df.dropna(how='all')
|
96 |
+
review_column = st.selectbox('Select the column from your excel file containing text', df.columns)
|
97 |
+
df[review_column] = df[review_column].astype(str)
|
98 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
|
100 |
+
filter_words_input = st.text_input('Enter words to filter the data by, separated by comma (or leave empty)') # New input field for filter words
|
101 |
+
filter_words = [] if filter_words_input.strip() == "" else process_filter_words(filter_words_input) # Process the filter words
|
102 |
+
class_names = st.text_input('Enter the possible class names separated by comma') # New input field for class names
|
103 |
+
df = filter_dataframe(df, review_column, filter_words) # Filter the DataFrame
|
104 |
except Exception as e:
|
105 |
st.write("An error occurred while reading the uploaded file. Please make sure it's a valid Excel file.")
|
106 |
return
|
107 |
|
108 |
+
start_button = st.button('Start Analysis')
|
109 |
|
110 |
|
111 |
if start_button and df is not None:
|
112 |
+
# Drop rows with NaN or blank values in the review_column
|
113 |
df = df[df[review_column].notna()]
|
114 |
df = df[df[review_column].str.strip() != '']
|
115 |
|
|
|
133 |
|
134 |
|
135 |
|
136 |
+
def process_reviews(df, review_column, class_names):
|
137 |
with st.spinner('Classifying reviews...'):
|
138 |
progress_bar = st.progress(0)
|
139 |
total_reviews = len(df[review_column].tolist())
|
140 |
review_counter = 0
|
141 |
|
142 |
+
batch_size = 50
|
143 |
raw_scores = []
|
144 |
reviews = df[review_column].tolist()
|
145 |
for i in range(0, len(reviews), batch_size):
|
146 |
batch_reviews = reviews[i:i+batch_size]
|
147 |
+
batch_scores = classify_reviews(batch_reviews)
|
148 |
raw_scores.extend(batch_scores)
|
149 |
review_counter += len(batch_reviews)
|
150 |
progress_bar.progress(review_counter / total_reviews)
|
151 |
|
152 |
with st.spinner('Generating classes...'):
|
153 |
+
class_scores = classify_with_new_classes(df[review_column].tolist(), class_names)
|
|
|
|
|
|
|
|
|
154 |
|
155 |
class_scores_dict = {} # New dictionary to store class scores
|
156 |
for i, name in enumerate(class_names):
|
|
|
161 |
if class_names and not all(name.isspace() for name in class_names):
|
162 |
df['Highest Class'] = df[class_names].idxmax(axis=1)
|
163 |
|
164 |
+
|
165 |
df_new = df.copy()
|
166 |
df_new['raw_scores'] = raw_scores
|
167 |
scores_to_df(df_new)
|
|
|
181 |
|
182 |
|
183 |
|
|
|
184 |
def scores_to_df(df):
|
185 |
for i in range(1, 6):
|
186 |
df[f'{i} Star'] = df['raw_scores'].apply(lambda scores: scores[i-1]).round(2)
|
|
|
249 |
cols[i-1].markdown("No important words to display")
|
250 |
|
251 |
|
252 |
+
|
253 |
+
|
254 |
|
255 |
if __name__ == "__main__":
|
256 |
main()
|