BulatF committed on
Commit
e94c73c
·
1 Parent(s): bb50616

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +60 -33
  2. requirements.txt +2 -1
app.py CHANGED
@@ -16,17 +16,29 @@ nltk.download('stopwords')
16
  import matplotlib.pyplot as plt
17
  import numpy as np
18
 
19
- stopwords_list = stopwords.words('english') + ['your_additional_stopwords_here']
 
 
20
 
21
- # Define the model and tokenizer
22
- model_name = 'nlptown/bert-base-multilingual-uncased-sentiment'
23
- model = AutoModelForSequenceClassification.from_pretrained(model_name)
24
- tokenizer = AutoTokenizer.from_pretrained(model_name)
25
  st.set_page_config(layout="wide")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
- # Import the new model and tokenizer
28
 
29
- classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
30
 
31
 
32
  #defs
@@ -83,38 +95,32 @@ def main():
83
  file = st.file_uploader("Upload an excel file", type=['xlsx'])
84
  review_column = None
85
  df = None
86
- class_names = None # New variable for class names
87
 
88
  if file is not None:
89
  try:
90
  df = pd.read_excel(file)
91
- # Drop rows where all columns are NaN
92
  df = df.dropna(how='all')
93
- # Replace blank spaces with NaN, then drop rows where all columns are NaN again
94
  df = df.replace(r'^\s*$', np.nan, regex=True)
95
  df = df.dropna(how='all')
96
  review_column = st.selectbox('Select the column from your excel file containing text', df.columns)
97
  df[review_column] = df[review_column].astype(str)
98
 
99
-
100
- filter_words_input = st.text_input('Enter words to filter the data by, separated by comma (or leave empty)') # New input field for filter words
101
- filter_words = [] if filter_words_input.strip() == "" else process_filter_words(filter_words_input) # Process the filter words
102
- class_names = st.text_input('Enter the possible class names separated by comma') # New input field for class names
103
- df = filter_dataframe(df, review_column, filter_words) # Filter the DataFrame
104
  except Exception as e:
105
  st.write("An error occurred while reading the uploaded file. Please make sure it's a valid Excel file.")
106
  return
107
 
108
  start_button = st.button('Start Analysis')
109
 
110
-
111
  if start_button and df is not None:
112
- # Drop rows with NaN or blank values in the review_column
113
  df = df[df[review_column].notna()]
114
  df = df[df[review_column].str.strip() != '']
115
-
116
- class_names = [name.strip() for name in class_names.split(',')] # Split class names into a list
117
- for name in class_names: # Add a new column for each class name
118
  if name not in df.columns:
119
  df[name] = 0.0
120
 
@@ -122,10 +128,11 @@ def main():
122
  with st.spinner('Performing sentiment analysis...'):
123
  df, df_display = process_reviews(df, review_column, class_names)
124
 
125
- display_ratings(df, review_column) # updated this line
126
  display_dataframe(df, df_display)
127
  else:
128
- st.write(f'No column named "{review_column}" found in the uploaded file.')
 
129
 
130
 
131
 
@@ -219,22 +226,41 @@ def display_dataframe(df, df_display):
219
 
220
  st.dataframe(df_display)
221
 
222
- def important_words(reviews, num_words=5):
223
- if len(reviews) == 0:
224
- return []
225
- vectorizer = TfidfVectorizer(stop_words=stopwords_list, max_features=10000)
226
- vectors = vectorizer.fit_transform(reviews)
227
- features = vectorizer.get_feature_names_out()
228
- indices = np.argsort(vectorizer.idf_)[::-1]
229
- top_features = [features[i] for i in indices[:num_words]]
230
- return top_features
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
 
232
  def display_ratings(df, review_column):
233
  cols = st.columns(5)
234
 
235
  for i in range(1, 6):
236
  rating_reviews = df[df['Rating'] == i][review_column]
237
- top_words = important_words(rating_reviews)
238
 
239
  rating_counts = rating_reviews.shape[0]
240
  cols[i-1].markdown(f"### {rating_counts}")
@@ -243,12 +269,13 @@ def display_ratings(df, review_column):
243
  # Display the most important words for each rating
244
  cols[i-1].markdown(f"#### Most Important Words:")
245
  if top_words:
246
- for word in top_words:
247
  cols[i-1].markdown(f"**{word}**")
248
  else:
249
  cols[i-1].markdown("No important words to display")
250
 
251
 
 
252
 
253
 
254
 
 
16
  import matplotlib.pyplot as plt
17
  import numpy as np
18
 
19
+ from lime.lime_text import LimeTextExplainer
20
+ from lime import lime_text
21
+
22
 
23
+ stopwords_list = stopwords.words('english') + ['your_additional_stopwords_here']
 
 
 
24
  st.set_page_config(layout="wide")
25
@st.cache_resource
def load_model_and_tokenizer(model_name):
    """Load the sequence-classification model and tokenizer for ``model_name``.

    Decorated with ``st.cache_resource`` so the loaded objects can be reused
    rather than re-created for every call with the same ``model_name``.

    Args:
        model_name: Model identifier passed to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # Tuple elements are evaluated left to right: model first, then tokenizer.
    return (
        AutoModelForSequenceClassification.from_pretrained(model_name),
        AutoTokenizer.from_pretrained(model_name),
    )
30
+
31
+ model, tokenizer = load_model_and_tokenizer('nlptown/bert-base-multilingual-uncased-sentiment')
32
+
33
@st.cache_resource
def load_pipeline():
    """Build the zero-shot classification pipeline used by the app.

    Decorated with ``st.cache_resource`` so the pipeline object can be reused
    across calls instead of being rebuilt each time.

    Returns:
        The object returned by ``pipeline`` for the
        ``"zero-shot-classification"`` task with ``facebook/bart-large-mnli``.
    """
    return pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
37
+
38
+ classifier = load_pipeline()
39
+
40
 
 
41
 
 
42
 
43
 
44
  #defs
 
95
  file = st.file_uploader("Upload an excel file", type=['xlsx'])
96
  review_column = None
97
  df = None
98
+ class_names = None
99
 
100
  if file is not None:
101
  try:
102
  df = pd.read_excel(file)
 
103
  df = df.dropna(how='all')
 
104
  df = df.replace(r'^\s*$', np.nan, regex=True)
105
  df = df.dropna(how='all')
106
  review_column = st.selectbox('Select the column from your excel file containing text', df.columns)
107
  df[review_column] = df[review_column].astype(str)
108
 
109
+ filter_words_input = st.text_input('Enter words to filter the data by, separated by comma (or leave empty)')
110
+ filter_words = [] if filter_words_input.strip() == "" else process_filter_words(filter_words_input)
111
+ class_names = st.text_input('Enter the possible class names separated by comma')
112
+ df = filter_dataframe(df, review_column, filter_words)
 
113
  except Exception as e:
114
  st.write("An error occurred while reading the uploaded file. Please make sure it's a valid Excel file.")
115
  return
116
 
117
  start_button = st.button('Start Analysis')
118
 
 
119
  if start_button and df is not None:
 
120
  df = df[df[review_column].notna()]
121
  df = df[df[review_column].str.strip() != '']
122
+ class_names = [name.strip() for name in class_names.split(',')]
123
+ for name in class_names:
 
124
  if name not in df.columns:
125
  df[name] = 0.0
126
 
 
128
  with st.spinner('Performing sentiment analysis...'):
129
  df, df_display = process_reviews(df, review_column, class_names)
130
 
131
+ display_ratings(df, review_column)
132
  display_dataframe(df, df_display)
133
  else:
134
+ st.write("The selected review column doesn't exist in the dataframe")
135
+
136
 
137
 
138
 
 
226
 
227
  st.dataframe(df_display)
228
 
229
def important_words(reviews, model, num_words=5):
    """Collect the words LIME rates as most influential for each star rating.

    Each review is explained once with LIME against all five output classes
    of the sentiment model; the top ``num_words`` features per class are
    gathered across every review and de-duplicated.

    Args:
        reviews: Iterable of review texts (strings).
        model: Classifier called as ``model(**inputs)``; its ``logits`` are
            softmaxed over dim 1 to obtain class probabilities.
        num_words: Number of LIME features kept per review and per rating.

    Returns:
        dict mapping each rating 1..5 to a list of unique words in order of
        first appearance. Every list is empty when ``reviews`` is empty, so
        callers should test ``result[rating]`` rather than ``result`` itself.
    """
    # Local import keeps this block self-contained; the tokenizer already
    # returns PyTorch tensors (return_tensors='pt').
    import torch

    ratings = range(1, 6)
    label_indices = [rating - 1 for rating in ratings]  # class index = rating - 1
    explainer = LimeTextExplainer(class_names=[str(rating) for rating in ratings])

    def predict_proba(texts):
        # LIME hands over a batch of perturbed texts and expects one row of
        # class probabilities per text.
        inputs = tokenizer(texts, return_tensors='pt', truncation=True, padding=True, max_length=512)
        # Inference only: skip autograd bookkeeping for the perturbation batch.
        with torch.no_grad():
            outputs = model(**inputs)
        # NOTE(review): F is presumably torch.nn.functional — confirm at the import site.
        return F.softmax(outputs.logits, dim=1).detach().numpy()

    words_by_rating = {rating: [] for rating in ratings}

    for review in reviews:
        # One explanation covers all five labels, so the model scores the
        # perturbed samples once per review instead of once per rating.
        explanation = explainer.explain_instance(
            review, predict_proba, num_features=num_words, labels=label_indices
        )
        for rating in ratings:
            words_by_rating[rating].extend(
                feature[0] for feature in explanation.as_list(label=rating - 1)
            )

    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # displayed word list is stable (set() ordering is arbitrary).
    return {
        rating: list(dict.fromkeys(words))
        for rating, words in words_by_rating.items()
    }
256
+
257
 
258
  def display_ratings(df, review_column):
259
  cols = st.columns(5)
260
 
261
  for i in range(1, 6):
262
  rating_reviews = df[df['Rating'] == i][review_column]
263
+ top_words = important_words(rating_reviews, model)
264
 
265
  rating_counts = rating_reviews.shape[0]
266
  cols[i-1].markdown(f"### {rating_counts}")
 
269
  # Display the most important words for each rating
270
  cols[i-1].markdown(f"#### Most Important Words:")
271
  if top_words:
272
+ for word in top_words[i]:
273
  cols[i-1].markdown(f"**{word}**")
274
  else:
275
  cols[i-1].markdown("No important words to display")
276
 
277
 
278
+
279
 
280
 
281
 
requirements.txt CHANGED
@@ -9,4 +9,5 @@ matplotlib
9
  fuzzywuzzy
10
  scikit-learn
11
  nltk
12
- numpy
 
 
9
  fuzzywuzzy
10
  scikit-learn
11
  nltk
12
+ numpy
13
+ lime