BulatF committed on
Commit
6a00b4d
·
1 Parent(s): 732f6c3

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -54
app.py CHANGED
@@ -6,12 +6,11 @@ from fuzzywuzzy import fuzz
6
  from sklearn.feature_extraction.text import TfidfVectorizer
7
  import torch.nn.functional as F
8
  import torch
9
- import os
10
  import io
11
  import base64
12
  from stqdm import stqdm
13
  import nltk
14
- import tempfile
15
  from nltk.corpus import stopwords
16
  nltk.download('stopwords')
17
  import matplotlib.pyplot as plt
@@ -31,19 +30,12 @@ classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnl
31
 
32
 
33
  #defs
34
def classify_reviews(reviews, batch_size=64):
    """Return softmax probabilities for every review, processed in batches.

    Args:
        reviews: List of review strings.
        batch_size: Number of reviews tokenized and scored per forward pass.

    Returns:
        A list with one entry per review, in input order; each entry is the
        list of probabilities obtained by applying softmax over the logits.
        An empty ``reviews`` list yields an empty list.
    """
    probabilities = []

    # Inference only: disable autograd so no computation graph is retained
    # for any batch while the logits are produced.
    with torch.no_grad():
        for start in range(0, len(reviews), batch_size):
            batch_reviews = reviews[start:start + batch_size]
            # `tokenizer` and `model` are module-level objects created
            # elsewhere in app.py (outside this excerpt).
            inputs = tokenizer(batch_reviews, return_tensors='pt', truncation=True, padding=True, max_length=512)
            outputs = model(**inputs)
            probabilities.extend(F.softmax(outputs.logits, dim=1).tolist())

    return probabilities
45
 
46
-
47
  def top_rating(scores):
48
  return scores.index(max(scores)) + 1
49
 
@@ -70,17 +62,15 @@ def process_filter_words(filter_words_input):
70
 
71
 
72
  # Function for classifying with the new model
73
def classify_with_new_classes(reviews, class_names, batch_size=64):
    """Score every review against the user-supplied class names.

    Args:
        reviews: List of review strings.
        class_names: Candidate labels handed to the zero-shot classifier.
        batch_size: Step used to walk through ``reviews`` in slices; each
            review inside a slice is still classified individually.

    Returns:
        A list with one entry per review; each entry holds the scores in the
        same order as ``class_names``.
    """
    def _scores_in_class_order(text):
        # `classifier` is the module-level zero-shot pipeline.
        prediction = classifier(text, class_names)
        by_label = dict(zip(prediction['labels'], prediction['scores']))
        # Re-align the scores with the caller's label order via the label
        # names rather than relying on the order the classifier returned.
        return [by_label[label] for label in class_names]

    return [
        _scores_in_class_order(text)
        for start in range(0, len(reviews), batch_size)
        for text in reviews[start:start + batch_size]
    ]
86
 
@@ -90,43 +80,36 @@ def main():
90
  st.title('Sentiment Analysis')
91
  st.markdown('Upload an Excel file to get sentiment analytics')
92
 
93
- uploaded_file = st.file_uploader("Upload an excel file", type=['xlsx'], key='unique1')
94
  review_column = None
95
  df = None
96
  class_names = None # New variable for class names
97
 
98
- if uploaded_file is not None:
99
- with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
100
- tmp.write(uploaded_file.getvalue())
101
- tmp_file_name = tmp.name
102
-
103
  try:
104
- # Reading Excel file in chunks
105
- chunk_size = 1000 # you can adjust this value according to your memory
106
- chunks = []
107
- for chunk in pd.read_excel(tmp_file_name, chunksize=chunk_size):
108
- chunk = chunk.dropna(how='all')
109
- chunk = chunk.replace(r'^\s*$', np.nan, regex=True)
110
- chunk = chunk.dropna(how='all')
111
- review_column = st.selectbox('Select the column from your excel file containing text', chunk.columns)
112
- chunk[review_column] = chunk[review_column].astype(str)
113
-
114
- filter_words_input = st.text_input('Enter words to filter the data by, separated by comma (or leave empty)', key='filter words') # New input field for filter words
115
- filter_words = [] if filter_words_input.strip() == "" else process_filter_words(filter_words_input) # Process the filter words
116
- class_names = st.text_input('Enter the possible class names separated by comma', key='class names') # New input field for class names
117
- chunk = filter_dataframe(chunk, review_column, filter_words) # Filter the DataFrame
118
-
119
- chunks.append(chunk)
120
 
121
- df = pd.concat(chunks, ignore_index=True)
 
 
 
122
  except Exception as e:
123
  st.write("An error occurred while reading the uploaded file. Please make sure it's a valid Excel file.")
124
  return
125
 
126
- start_button = st.button('Start Analysis', key='unique2')
127
 
128
 
129
  if start_button and df is not None:
 
130
  df = df[df[review_column].notna()]
131
  df = df[df[review_column].str.strip() != '']
132
 
@@ -150,27 +133,24 @@ def main():
150
 
151
 
152
 
153
- def process_reviews(df, review_column, class_names, batch_size=64):
154
  with st.spinner('Classifying reviews...'):
155
  progress_bar = st.progress(0)
156
  total_reviews = len(df[review_column].tolist())
157
  review_counter = 0
158
 
 
159
  raw_scores = []
160
  reviews = df[review_column].tolist()
161
  for i in range(0, len(reviews), batch_size):
162
  batch_reviews = reviews[i:i+batch_size]
163
- batch_scores = classify_reviews(batch_reviews, batch_size)
164
  raw_scores.extend(batch_scores)
165
  review_counter += len(batch_reviews)
166
  progress_bar.progress(review_counter / total_reviews)
167
 
168
  with st.spinner('Generating classes...'):
169
- class_scores = []
170
- for i in range(0, len(reviews), batch_size):
171
- batch_reviews = reviews[i:i+batch_size]
172
- batch_scores = classify_with_new_classes(batch_reviews, class_names, batch_size)
173
- class_scores.extend(batch_scores)
174
 
175
  class_scores_dict = {} # New dictionary to store class scores
176
  for i, name in enumerate(class_names):
@@ -181,6 +161,7 @@ def process_reviews(df, review_column, class_names, batch_size=64):
181
  if class_names and not all(name.isspace() for name in class_names):
182
  df['Highest Class'] = df[class_names].idxmax(axis=1)
183
 
 
184
  df_new = df.copy()
185
  df_new['raw_scores'] = raw_scores
186
  scores_to_df(df_new)
@@ -200,7 +181,6 @@ def process_reviews(df, review_column, class_names, batch_size=64):
200
 
201
 
202
 
203
-
204
  def scores_to_df(df):
205
  for i in range(1, 6):
206
  df[f'{i} Star'] = df['raw_scores'].apply(lambda scores: scores[i-1]).round(2)
@@ -269,6 +249,8 @@ def display_ratings(df, review_column):
269
  cols[i-1].markdown("No important words to display")
270
 
271
 
 
 
272
 
273
  if __name__ == "__main__":
274
  main()
 
6
  from sklearn.feature_extraction.text import TfidfVectorizer
7
  import torch.nn.functional as F
8
  import torch
 
9
  import io
10
  import base64
11
  from stqdm import stqdm
12
  import nltk
13
+
14
  from nltk.corpus import stopwords
15
  nltk.download('stopwords')
16
  import matplotlib.pyplot as plt
 
30
 
31
 
32
  #defs
33
def classify_reviews(reviews):
    """Return softmax probabilities for one batch of review texts.

    Args:
        reviews: List of review strings, tokenized together as one batch.

    Returns:
        A list with one entry per review, in input order; each entry is the
        list of probabilities obtained by applying softmax over the logits.
        An empty ``reviews`` list yields an empty list.
    """
    # Nothing to tokenize: answer directly instead of handing the tokenizer
    # an empty batch.
    if len(reviews) == 0:
        return []

    # `tokenizer` and `model` are module-level objects created elsewhere in
    # app.py (outside this excerpt).
    inputs = tokenizer(reviews, return_tensors='pt', truncation=True, padding=True, max_length=512)
    # Inference only: disable autograd so no computation graph is retained.
    with torch.no_grad():
        outputs = model(**inputs)
    probabilities = F.softmax(outputs.logits, dim=1).tolist()
    return probabilities
38
 
 
39
  def top_rating(scores):
40
  return scores.index(max(scores)) + 1
41
 
 
62
 
63
 
64
  # Function for classifying with the new model
65
def classify_with_new_classes(reviews, class_names):
    """Score every review against the user-supplied class names.

    Args:
        reviews: List of review strings; each one is classified individually.
        class_names: Candidate labels handed to the zero-shot classifier.

    Returns:
        A list with one entry per review; each entry holds the scores in the
        same order as ``class_names``.
    """
    def _scores_in_class_order(text):
        # `classifier` is the module-level zero-shot pipeline.
        prediction = classifier(text, class_names)
        by_label = dict(zip(prediction['labels'], prediction['scores']))
        # Re-align the scores with the caller's label order via the label
        # names rather than relying on the order the classifier returned.
        return [by_label[label] for label in class_names]

    return [_scores_in_class_order(text) for text in reviews]
76
 
 
80
  st.title('Sentiment Analysis')
81
  st.markdown('Upload an Excel file to get sentiment analytics')
82
 
83
+ file = st.file_uploader("Upload an excel file", type=['xlsx'])
84
  review_column = None
85
  df = None
86
  class_names = None # New variable for class names
87
 
88
+ if file is not None:
 
 
 
 
89
  try:
90
+ df = pd.read_excel(file)
91
+ # Drop rows where all columns are NaN
92
+ df = df.dropna(how='all')
93
+ # Replace blank spaces with NaN, then drop rows where all columns are NaN again
94
+ df = df.replace(r'^\s*$', np.nan, regex=True)
95
+ df = df.dropna(how='all')
96
+ review_column = st.selectbox('Select the column from your excel file containing text', df.columns)
97
+ df[review_column] = df[review_column].astype(str)
98
+
 
 
 
 
 
 
 
99
 
100
+ filter_words_input = st.text_input('Enter words to filter the data by, separated by comma (or leave empty)') # New input field for filter words
101
+ filter_words = [] if filter_words_input.strip() == "" else process_filter_words(filter_words_input) # Process the filter words
102
+ class_names = st.text_input('Enter the possible class names separated by comma') # New input field for class names
103
+ df = filter_dataframe(df, review_column, filter_words) # Filter the DataFrame
104
  except Exception as e:
105
  st.write("An error occurred while reading the uploaded file. Please make sure it's a valid Excel file.")
106
  return
107
 
108
+ start_button = st.button('Start Analysis')
109
 
110
 
111
  if start_button and df is not None:
112
+ # Drop rows with NaN or blank values in the review_column
113
  df = df[df[review_column].notna()]
114
  df = df[df[review_column].str.strip() != '']
115
 
 
133
 
134
 
135
 
136
+ def process_reviews(df, review_column, class_names):
137
  with st.spinner('Classifying reviews...'):
138
  progress_bar = st.progress(0)
139
  total_reviews = len(df[review_column].tolist())
140
  review_counter = 0
141
 
142
+ batch_size = 50
143
  raw_scores = []
144
  reviews = df[review_column].tolist()
145
  for i in range(0, len(reviews), batch_size):
146
  batch_reviews = reviews[i:i+batch_size]
147
+ batch_scores = classify_reviews(batch_reviews)
148
  raw_scores.extend(batch_scores)
149
  review_counter += len(batch_reviews)
150
  progress_bar.progress(review_counter / total_reviews)
151
 
152
  with st.spinner('Generating classes...'):
153
+ class_scores = classify_with_new_classes(df[review_column].tolist(), class_names)
 
 
 
 
154
 
155
  class_scores_dict = {} # New dictionary to store class scores
156
  for i, name in enumerate(class_names):
 
161
  if class_names and not all(name.isspace() for name in class_names):
162
  df['Highest Class'] = df[class_names].idxmax(axis=1)
163
 
164
+
165
  df_new = df.copy()
166
  df_new['raw_scores'] = raw_scores
167
  scores_to_df(df_new)
 
181
 
182
 
183
 
 
184
  def scores_to_df(df):
185
  for i in range(1, 6):
186
  df[f'{i} Star'] = df['raw_scores'].apply(lambda scores: scores[i-1]).round(2)
 
249
  cols[i-1].markdown("No important words to display")
250
 
251
 
252
+
253
+
254
 
255
  if __name__ == "__main__":
256
  main()