Spaces:

BulatF
/

StreamlitSentiment

Runtime error

App Files Files Community

BulatF committed on Jul 5, 2023

Commit

e94c73c

1 Parent(s): bb50616

Upload 2 files

Browse files

Files changed (2) hide show

app.py +60 -33
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -16,17 +16,29 @@ nltk.download('stopwords')
 import matplotlib.pyplot as plt
 import numpy as np
-stopwords_list = stopwords.words('english') + ['your_additional_stopwords_here']
-# Define the model and tokenizer
-model_name = 'nlptown/bert-base-multilingual-uncased-sentiment'
-model = AutoModelForSequenceClassification.from_pretrained(model_name)
-tokenizer = AutoTokenizer.from_pretrained(model_name)
 st.set_page_config(layout="wide")
-# Import the new model and tokenizer
-classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
 #defs
@@ -83,38 +95,32 @@ def main():
     file = st.file_uploader("Upload an excel file", type=['xlsx'])
     review_column = None
     df = None
-    class_names = None  # New variable for class names
     if file is not None:
         try:
             df = pd.read_excel(file)
-            # Drop rows where all columns are NaN
             df = df.dropna(how='all')
-            # Replace blank spaces with NaN, then drop rows where all columns are NaN again
             df = df.replace(r'^\s*$', np.nan, regex=True)
             df = df.dropna(how='all')
             review_column = st.selectbox('Select the column from your excel file containing text', df.columns)
             df[review_column] = df[review_column].astype(str)
-            filter_words_input = st.text_input('Enter words to filter the data by, separated by comma (or leave empty)')  # New input field for filter words
-            filter_words = [] if filter_words_input.strip() == "" else process_filter_words(filter_words_input)  # Process the filter words
-            class_names = st.text_input('Enter the possible class names separated by comma')  # New input field for class names
-            df = filter_dataframe(df, review_column, filter_words)  # Filter the DataFrame
         except Exception as e:
             st.write("An error occurred while reading the uploaded file. Please make sure it's a valid Excel file.")
             return
     start_button = st.button('Start Analysis')
     if start_button and df is not None:
-        # Drop rows with NaN or blank values in the review_column
         df = df[df[review_column].notna()]
         df = df[df[review_column].str.strip() != '']
-        class_names = [name.strip() for name in class_names.split(',')]  # Split class names into a list
-        for name in class_names:  # Add a new column for each class name
             if name not in df.columns:
                 df[name] = 0.0
@@ -122,10 +128,11 @@ def main():
             with st.spinner('Performing sentiment analysis...'):
                 df, df_display = process_reviews(df, review_column, class_names)
-            display_ratings(df, review_column)  # updated this line
             display_dataframe(df, df_display)
         else:
-            st.write(f'No column named "{review_column}" found in the uploaded file.')
@@ -219,22 +226,41 @@ def display_dataframe(df, df_display):
     st.dataframe(df_display)
-def important_words(reviews, num_words=5):
-    if len(reviews) == 0:
-        return []
-    vectorizer = TfidfVectorizer(stop_words=stopwords_list, max_features=10000)
-    vectors = vectorizer.fit_transform(reviews)
-    features = vectorizer.get_feature_names_out()
-    indices = np.argsort(vectorizer.idf_)[::-1]
-    top_features = [features[i] for i in indices[:num_words]]
-    return top_features
 def display_ratings(df, review_column):
     cols = st.columns(5)
     for i in range(1, 6):
         rating_reviews = df[df['Rating'] == i][review_column]
-        top_words = important_words(rating_reviews)
         rating_counts = rating_reviews.shape[0]
         cols[i-1].markdown(f"### {rating_counts}")
@@ -243,12 +269,13 @@ def display_ratings(df, review_column):
         # Display the most important words for each rating
         cols[i-1].markdown(f"#### Most Important Words:")
         if top_words:
-            for word in top_words:
                 cols[i-1].markdown(f"**{word}**")
         else:
             cols[i-1].markdown("No important words to display")

 import matplotlib.pyplot as plt
 import numpy as np
+from lime.lime_text import LimeTextExplainer
+from lime import lime_text
+stopwords_list = stopwords.words('english') + ['your_additional_stopwords_here']
 st.set_page_config(layout="wide")
+@st.cache_resource
+def load_model_and_tokenizer(model_name):
+    model = AutoModelForSequenceClassification.from_pretrained(model_name)
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    return model, tokenizer
+model, tokenizer = load_model_and_tokenizer('nlptown/bert-base-multilingual-uncased-sentiment')
+@st.cache_resource
+def load_pipeline():
+    classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
+    return classifier
+classifier = load_pipeline()
 #defs
     file = st.file_uploader("Upload an excel file", type=['xlsx'])
     review_column = None
     df = None
+    class_names = None
     if file is not None:
         try:
             df = pd.read_excel(file)
             df = df.dropna(how='all')
             df = df.replace(r'^\s*$', np.nan, regex=True)
             df = df.dropna(how='all')
             review_column = st.selectbox('Select the column from your excel file containing text', df.columns)
             df[review_column] = df[review_column].astype(str)
+            filter_words_input = st.text_input('Enter words to filter the data by, separated by comma (or leave empty)')
+            filter_words = [] if filter_words_input.strip() == "" else process_filter_words(filter_words_input)
+            class_names = st.text_input('Enter the possible class names separated by comma')
+            df = filter_dataframe(df, review_column, filter_words)
         except Exception as e:
             st.write("An error occurred while reading the uploaded file. Please make sure it's a valid Excel file.")
             return
     start_button = st.button('Start Analysis')
     if start_button and df is not None:
         df = df[df[review_column].notna()]
         df = df[df[review_column].str.strip() != '']
+        class_names = [name.strip() for name in class_names.split(',')]
+        for name in class_names:
             if name not in df.columns:
                 df[name] = 0.0
             with st.spinner('Performing sentiment analysis...'):
                 df, df_display = process_reviews(df, review_column, class_names)
+            display_ratings(df, review_column)
             display_dataframe(df, df_display)
         else:
+            st.write("The selected review column doesn't exist in the dataframe")
     st.dataframe(df_display)
+def important_words(reviews, model, num_words=5):
+    # Create a LimeTextExplainer
+    explainer = LimeTextExplainer(class_names=[str(i) for i in range(1, 6)])
+    # Define a prediction function that takes a list of texts and outputs a prediction matrix
+    def predict_proba(texts):
+        inputs = tokenizer(texts, return_tensors='pt', truncation=True, padding=True, max_length=512)
+        outputs = model(**inputs)
+        probabilities = F.softmax(outputs.logits, dim=1).detach().numpy()
+        return probabilities
+    important_words_per_rating = {}
+    for rating in range(1, 6):
+        important_words_per_rating[rating] = []
+        for review in reviews:
+            # Get the explanation for the review
+            explanation = explainer.explain_instance(review, predict_proba, num_features=num_words, labels=[rating - 1])
+            # Get the list of important words
+            words = [feature[0] for feature in explanation.as_list(rating - 1)]
+            important_words_per_rating[rating].extend(words)
+        # Keep only unique words
+        important_words_per_rating[rating] = list(set(important_words_per_rating[rating]))
+    return important_words_per_rating
 def display_ratings(df, review_column):
     cols = st.columns(5)
     for i in range(1, 6):
         rating_reviews = df[df['Rating'] == i][review_column]
+        top_words = important_words(rating_reviews, model)
         rating_counts = rating_reviews.shape[0]
         cols[i-1].markdown(f"### {rating_counts}")
         # Display the most important words for each rating
         cols[i-1].markdown(f"#### Most Important Words:")
         if top_words:
+            for word in top_words[i]:
                 cols[i-1].markdown(f"**{word}**")
         else:
             cols[i-1].markdown("No important words to display")

requirements.txt CHANGED Viewed

@@ -9,4 +9,5 @@ matplotlib
 fuzzywuzzy
 scikit-learn
 nltk
-numpy

 fuzzywuzzy
 scikit-learn
 nltk
+numpy
+lime