BulatF committed on
Commit
0f8f9a1
·
1 Parent(s): 9052e90

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -65
app.py CHANGED
@@ -16,29 +16,17 @@ nltk.download('stopwords')
16
  import matplotlib.pyplot as plt
17
  import numpy as np
18
 
19
- from lime.lime_text import LimeTextExplainer
20
- from lime import lime_text
21
-
22
-
23
  stopwords_list = stopwords.words('english') + ['your_additional_stopwords_here']
24
- st.set_page_config(layout="wide")
25
- @st.cache_resource
26
- def load_model_and_tokenizer(model_name):
27
- model = AutoModelForSequenceClassification.from_pretrained(model_name)
28
- tokenizer = AutoTokenizer.from_pretrained(model_name)
29
- return model, tokenizer
30
-
31
- model, tokenizer = load_model_and_tokenizer('nlptown/bert-base-multilingual-uncased-sentiment')
32
-
33
- @st.cache_resource
34
- def load_pipeline():
35
- classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
36
- return classifier
37
-
38
- classifier = load_pipeline()
39
 
 
 
 
 
 
40
 
 
41
 
 
42
 
43
 
44
  #defs
@@ -95,32 +83,38 @@ def main():
95
  file = st.file_uploader("Upload an excel file", type=['xlsx'])
96
  review_column = None
97
  df = None
98
- class_names = None
99
 
100
  if file is not None:
101
  try:
102
  df = pd.read_excel(file)
 
103
  df = df.dropna(how='all')
 
104
  df = df.replace(r'^\s*$', np.nan, regex=True)
105
  df = df.dropna(how='all')
106
  review_column = st.selectbox('Select the column from your excel file containing text', df.columns)
107
  df[review_column] = df[review_column].astype(str)
108
 
109
- filter_words_input = st.text_input('Enter words to filter the data by, separated by comma (or leave empty)')
110
- filter_words = [] if filter_words_input.strip() == "" else process_filter_words(filter_words_input)
111
- class_names = st.text_input('Enter the possible class names separated by comma')
112
- df = filter_dataframe(df, review_column, filter_words)
 
113
  except Exception as e:
114
  st.write("An error occurred while reading the uploaded file. Please make sure it's a valid Excel file.")
115
  return
116
 
117
  start_button = st.button('Start Analysis')
118
 
 
119
  if start_button and df is not None:
 
120
  df = df[df[review_column].notna()]
121
  df = df[df[review_column].str.strip() != '']
122
- class_names = [name.strip() for name in class_names.split(',')]
123
- for name in class_names:
 
124
  if name not in df.columns:
125
  df[name] = 0.0
126
 
@@ -128,11 +122,10 @@ def main():
128
  with st.spinner('Performing sentiment analysis...'):
129
  df, df_display = process_reviews(df, review_column, class_names)
130
 
131
- display_ratings(df, review_column)
132
  display_dataframe(df, df_display)
133
  else:
134
- st.write("The selected review column doesn't exist in the dataframe")
135
-
136
 
137
 
138
 
@@ -226,46 +219,22 @@ def display_dataframe(df, df_display):
226
 
227
  st.dataframe(df_display)
228
 
229
- def important_words(reviews, model, num_words=5, batch_size=50):
230
- # Create a LimeTextExplainer
231
- explainer = LimeTextExplainer(class_names=[str(i) for i in range(1, 6)])
232
-
233
- # Define a prediction function that takes a list of texts and outputs a prediction matrix
234
- def predict_proba(texts):
235
- inputs = tokenizer(texts, return_tensors='pt', truncation=True, padding=True, max_length=512)
236
- outputs = model(**inputs)
237
- probabilities = F.softmax(outputs.logits, dim=1).detach().numpy()
238
- return probabilities
239
-
240
- important_words_per_rating = {}
241
-
242
- for rating in range(1, 6):
243
- important_words_per_rating[rating] = []
244
-
245
- # Batch processing
246
- for i in range(0, len(reviews), batch_size):
247
- batch_reviews = reviews[i:i+batch_size]
248
- for review in batch_reviews:
249
- # Get the explanation for the review
250
- explanation = explainer.explain_instance(review, predict_proba, num_features=num_words, labels=[rating - 1])
251
-
252
- # Get the list of important words
253
- words = [feature[0] for feature in explanation.as_list(rating - 1)]
254
- important_words_per_rating[rating].extend(words)
255
-
256
- # Keep only unique words
257
- important_words_per_rating[rating] = list(set(important_words_per_rating[rating]))
258
-
259
- return important_words_per_rating
260
-
261
-
262
 
263
  def display_ratings(df, review_column):
264
  cols = st.columns(5)
265
 
266
  for i in range(1, 6):
267
  rating_reviews = df[df['Rating'] == i][review_column]
268
- top_words = important_words(rating_reviews, model)
269
 
270
  rating_counts = rating_reviews.shape[0]
271
  cols[i-1].markdown(f"### {rating_counts}")
@@ -274,13 +243,12 @@ def display_ratings(df, review_column):
274
  # Display the most important words for each rating
275
  cols[i-1].markdown(f"#### Most Important Words:")
276
  if top_words:
277
- for word in top_words[i]:
278
  cols[i-1].markdown(f"**{word}**")
279
  else:
280
  cols[i-1].markdown("No important words to display")
281
 
282
 
283
-
284
 
285
 
286
 
 
16
  import matplotlib.pyplot as plt
17
  import numpy as np
18
 
 
 
 
 
19
  stopwords_list = stopwords.words('english') + ['your_additional_stopwords_here']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
+ # Define the model and tokenizer
22
+ model_name = 'nlptown/bert-base-multilingual-uncased-sentiment'
23
+ model = AutoModelForSequenceClassification.from_pretrained(model_name)
24
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
25
+ st.set_page_config(layout="wide")
26
 
27
+ # Build the zero-shot classification pipeline
28
 
29
+ classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
30
 
31
 
32
  #defs
 
83
  file = st.file_uploader("Upload an excel file", type=['xlsx'])
84
  review_column = None
85
  df = None
86
+ class_names = None # New variable for class names
87
 
88
  if file is not None:
89
  try:
90
  df = pd.read_excel(file)
91
+ # Drop rows where all columns are NaN
92
  df = df.dropna(how='all')
93
+ # Replace blank spaces with NaN, then drop rows where all columns are NaN again
94
  df = df.replace(r'^\s*$', np.nan, regex=True)
95
  df = df.dropna(how='all')
96
  review_column = st.selectbox('Select the column from your excel file containing text', df.columns)
97
  df[review_column] = df[review_column].astype(str)
98
 
99
+
100
+ filter_words_input = st.text_input('Enter words to filter the data by, separated by comma (or leave empty)') # New input field for filter words
101
+ filter_words = [] if filter_words_input.strip() == "" else process_filter_words(filter_words_input) # Process the filter words
102
+ class_names = st.text_input('Enter the possible class names separated by comma') # New input field for class names
103
+ df = filter_dataframe(df, review_column, filter_words) # Filter the DataFrame
104
  except Exception as e:
105
  st.write("An error occurred while reading the uploaded file. Please make sure it's a valid Excel file.")
106
  return
107
 
108
  start_button = st.button('Start Analysis')
109
 
110
+
111
  if start_button and df is not None:
112
+ # Drop rows with NaN or blank values in the review_column
113
  df = df[df[review_column].notna()]
114
  df = df[df[review_column].str.strip() != '']
115
+
116
+ class_names = [name.strip() for name in class_names.split(',')] # Split class names into a list
117
+ for name in class_names: # Add a new column for each class name
118
  if name not in df.columns:
119
  df[name] = 0.0
120
 
 
122
  with st.spinner('Performing sentiment analysis...'):
123
  df, df_display = process_reviews(df, review_column, class_names)
124
 
125
+ display_ratings(df, review_column) # updated this line
126
  display_dataframe(df, df_display)
127
  else:
128
+ st.write(f'No column named "{review_column}" found in the uploaded file.')
 
129
 
130
 
131
 
 
219
 
220
  st.dataframe(df_display)
221
 
222
def important_words(reviews, num_words=5):
    """Return the highest-weighted TF-IDF terms found in ``reviews``.

    Terms are ranked by their TF-IDF weight summed over all reviews, so a
    word scores highly when it is prominent in this group of reviews.
    Ranking on ``idf_`` alone (the previous behaviour) only surfaced the
    rarest terms, which are mostly arbitrary ties among words that occur
    in a single review.

    Args:
        reviews: Sized iterable of review strings (for example a pandas
            Series or a list).
        num_words: Maximum number of terms to return.

    Returns:
        A list of at most ``num_words`` terms, strongest first. The list is
        empty when ``reviews`` is empty or when no usable term remains after
        stop-word removal.
    """
    # Explicit length check: truthiness of a pandas Series is ambiguous.
    if len(reviews) == 0:
        return []

    vectorizer = TfidfVectorizer(stop_words=stopwords_list, max_features=10000)
    try:
        vectors = vectorizer.fit_transform(reviews)
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when every
        # document consists solely of stop words / single-character tokens.
        # Treat that as "nothing to show" instead of crashing the page.
        return []

    features = vectorizer.get_feature_names_out()
    # Column sums give one aggregate weight per term; the sparse ``sum``
    # returns a 1 x n_terms matrix, so flatten it to a plain 1-D array.
    scores = np.asarray(vectors.sum(axis=0)).ravel()
    # Stable sort on the negated scores keeps tie order deterministic.
    ranked = np.argsort(-scores, kind="stable")
    return [features[i] for i in ranked[:num_words]]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
 
232
  def display_ratings(df, review_column):
233
  cols = st.columns(5)
234
 
235
  for i in range(1, 6):
236
  rating_reviews = df[df['Rating'] == i][review_column]
237
+ top_words = important_words(rating_reviews)
238
 
239
  rating_counts = rating_reviews.shape[0]
240
  cols[i-1].markdown(f"### {rating_counts}")
 
243
  # Display the most important words for each rating
244
  cols[i-1].markdown(f"#### Most Important Words:")
245
  if top_words:
246
+ for word in top_words:
247
  cols[i-1].markdown(f"**{word}**")
248
  else:
249
  cols[i-1].markdown("No important words to display")
250
 
251
 
 
252
 
253
 
254