Spaces:

BulatF
/

StreamlitSentiment

Runtime error

App Files Files Community

BulatF committed on Jul 6, 2023

Commit

6a00b4d

1 Parent(s): 732f6c3

Upload app.py

Browse files

Files changed (1) hide show

app.py +36 -54

app.py CHANGED Viewed

@@ -6,12 +6,11 @@ from fuzzywuzzy import fuzz
 from sklearn.feature_extraction.text import TfidfVectorizer
 import torch.nn.functional as F
 import torch
-import os
 import io
 import base64
 from stqdm import stqdm
 import nltk
-import tempfile
 from nltk.corpus import stopwords
 nltk.download('stopwords')
 import matplotlib.pyplot as plt
@@ -31,19 +30,12 @@ classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnl
 #defs
-def classify_reviews(reviews, batch_size=64):
-    probabilities = []
-    for i in range(0, len(reviews), batch_size):
-        batch_reviews = reviews[i:i+batch_size]
-        inputs = tokenizer(batch_reviews, return_tensors='pt', truncation=True, padding=True, max_length=512)
-        outputs = model(**inputs)
-        batch_probabilities = F.softmax(outputs.logits, dim=1).tolist()
-        probabilities.extend(batch_probabilities)
     return probabilities
 def top_rating(scores):
     """Return the 1-based position of the largest value in ``scores``.

     When the maximum occurs more than once, the first occurrence wins.
     """
     best = max(scores)
     position = scores.index(best)
     return position + 1
@@ -70,17 +62,15 @@ def process_filter_words(filter_words_input):
 # Function for classifying with the new model
-def classify_with_new_classes(reviews, class_names, batch_size=64):
     class_scores = []
-    for i in range(0, len(reviews), batch_size):
-        batch_reviews = reviews[i:i+batch_size]
-        for review in batch_reviews:
-            result = classifier(review, class_names)
-            scores_dict = dict(zip(result['labels'], result['scores']))
-            # Reorder scores to match the original class_names order
-            scores = [scores_dict[name] for name in class_names]
-            class_scores.append(scores)
     return class_scores
@@ -90,43 +80,36 @@ def main():
     st.title('Sentiment Analysis')
     st.markdown('Upload an Excel file to get sentiment analytics')
-    uploaded_file = st.file_uploader("Upload an excel file", type=['xlsx'], key='unique1')
     review_column = None
     df = None
     class_names = None  # New variable for class names
-    if uploaded_file is not None:
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
-            tmp.write(uploaded_file.getvalue())
-            tmp_file_name = tmp.name
         try:
-            # Reading Excel file in chunks
-            chunk_size = 1000  # you can adjust this value according to your memory
-            chunks = []
-            for chunk in pd.read_excel(tmp_file_name, chunksize=chunk_size):
-                chunk = chunk.dropna(how='all')
-                chunk = chunk.replace(r'^\s*$', np.nan, regex=True)
-                chunk = chunk.dropna(how='all')
-                review_column = st.selectbox('Select the column from your excel file containing text', chunk.columns)
-                chunk[review_column] = chunk[review_column].astype(str)
-                filter_words_input = st.text_input('Enter words to filter the data by, separated by comma (or leave empty)', key='filter words')  # New input field for filter words
-                filter_words = [] if filter_words_input.strip() == "" else process_filter_words(filter_words_input)  # Process the filter words
-                class_names = st.text_input('Enter the possible class names separated by comma', key='class names')  # New input field for class names
-                chunk = filter_dataframe(chunk, review_column, filter_words)  # Filter the DataFrame
-                chunks.append(chunk)
-            df = pd.concat(chunks, ignore_index=True)
         except Exception as e:
             st.write("An error occurred while reading the uploaded file. Please make sure it's a valid Excel file.")
             return
-    start_button = st.button('Start Analysis', key='unique2')
     if start_button and df is not None:
         df = df[df[review_column].notna()]
         df = df[df[review_column].str.strip() != '']
@@ -150,27 +133,24 @@ def main():
-def process_reviews(df, review_column, class_names, batch_size=64):
     with st.spinner('Classifying reviews...'):
         progress_bar = st.progress(0)
         total_reviews = len(df[review_column].tolist())
         review_counter = 0
         raw_scores = []
         reviews = df[review_column].tolist()
         for i in range(0, len(reviews), batch_size):
             batch_reviews = reviews[i:i+batch_size]
-            batch_scores = classify_reviews(batch_reviews, batch_size)
             raw_scores.extend(batch_scores)
             review_counter += len(batch_reviews)
             progress_bar.progress(review_counter / total_reviews)
     with st.spinner('Generating classes...'):
-        class_scores = []
-        for i in range(0, len(reviews), batch_size):
-            batch_reviews = reviews[i:i+batch_size]
-            batch_scores = classify_with_new_classes(batch_reviews, class_names, batch_size)
-            class_scores.extend(batch_scores)
     class_scores_dict = {}  # New dictionary to store class scores
     for i, name in enumerate(class_names):
@@ -181,6 +161,7 @@ def process_reviews(df, review_column, class_names, batch_size=64):
     if class_names and not all(name.isspace() for name in class_names):
         df['Highest Class'] = df[class_names].idxmax(axis=1)
     df_new = df.copy()
     df_new['raw_scores'] = raw_scores
     scores_to_df(df_new)
@@ -200,7 +181,6 @@ def process_reviews(df, review_column, class_names, batch_size=64):
 def scores_to_df(df):
     for i in range(1, 6):
         df[f'{i} Star'] = df['raw_scores'].apply(lambda scores: scores[i-1]).round(2)
@@ -269,6 +249,8 @@ def display_ratings(df, review_column):
             cols[i-1].markdown("No important words to display")
 if __name__ == "__main__":
     main()

 from sklearn.feature_extraction.text import TfidfVectorizer
 import torch.nn.functional as F
 import torch
 import io
 import base64
 from stqdm import stqdm
 import nltk
 from nltk.corpus import stopwords
 nltk.download('stopwords')
 import matplotlib.pyplot as plt
 #defs
def classify_reviews(reviews):
    """Return softmax class probabilities for a batch of reviews.

    Args:
        reviews: List of review strings scored together as one batch.

    Returns:
        A list with one entry per review; each entry is the list of softmax
        probabilities computed over ``dim=1`` of the model's logits. An
        empty batch yields an empty list.
    """
    # Defensive guard: do not hand an empty batch to the tokenizer/model.
    if len(reviews) == 0:
        return []
    inputs = tokenizer(reviews, return_tensors='pt', truncation=True, padding=True, max_length=512)
    # Inference only: disable gradient tracking so no autograd state is
    # recorded for each batch.
    with torch.no_grad():
        outputs = model(**inputs)
    probabilities = F.softmax(outputs.logits, dim=1).tolist()
    return probabilities
 def top_rating(scores):
     """Return the 1-based position of the largest value in ``scores``.

     When the maximum occurs more than once, the first occurrence wins.
     """
     best = max(scores)
     position = scores.index(best)
     return position + 1
 # Function for classifying with the new model
def classify_with_new_classes(reviews, class_names):
    """Score every review against the user-supplied class names.

    Args:
        reviews: Iterable of review strings.
        class_names: Candidate labels handed to the zero-shot classifier.

    Returns:
        A list with one inner list per review, holding that review's scores
        in the same order as ``class_names``. When ``class_names`` is empty
        or ``None``, each inner list is empty and the classifier is not
        called.
    """
    # No labels to score against.
    # NOTE(review): the zero-shot pipeline is expected to reject an empty
    # label list - confirm against the transformers documentation.
    if not class_names:
        return [[] for _ in reviews]

    class_scores = []
    for review in reviews:
        result = classifier(review, class_names)
        # The classifier's label order may differ from the requested order,
        # so look scores up by label and emit them in class_names order.
        scores_by_label = dict(zip(result['labels'], result['scores']))
        class_scores.append([scores_by_label[name] for name in class_names])
    return class_scores
     st.title('Sentiment Analysis')
     st.markdown('Upload an Excel file to get sentiment analytics')
+    file = st.file_uploader("Upload an excel file", type=['xlsx'])
     review_column = None
     df = None
     class_names = None  # New variable for class names
+    if file is not None:
         try:
+            df = pd.read_excel(file)
+            # Drop rows where all columns are NaN
+            df = df.dropna(how='all')
+            # Replace blank spaces with NaN, then drop rows where all columns are NaN again
+            df = df.replace(r'^\s*$', np.nan, regex=True)
+            df = df.dropna(how='all')
+            review_column = st.selectbox('Select the column from your excel file containing text', df.columns)
+            df[review_column] = df[review_column].astype(str)
+            filter_words_input = st.text_input('Enter words to filter the data by, separated by comma (or leave empty)')  # New input field for filter words
+            filter_words = [] if filter_words_input.strip() == "" else process_filter_words(filter_words_input)  # Process the filter words
+            class_names = st.text_input('Enter the possible class names separated by comma')  # New input field for class names
+            df = filter_dataframe(df, review_column, filter_words)  # Filter the DataFrame
         except Exception as e:
             st.write("An error occurred while reading the uploaded file. Please make sure it's a valid Excel file.")
             return
+    start_button = st.button('Start Analysis')
     if start_button and df is not None:
+        # Drop rows with NaN or blank values in the review_column
         df = df[df[review_column].notna()]
         df = df[df[review_column].str.strip() != '']
+def process_reviews(df, review_column, class_names):
     with st.spinner('Classifying reviews...'):
         progress_bar = st.progress(0)
         total_reviews = len(df[review_column].tolist())
         review_counter = 0
+        batch_size = 50
         raw_scores = []
         reviews = df[review_column].tolist()
         for i in range(0, len(reviews), batch_size):
             batch_reviews = reviews[i:i+batch_size]
+            batch_scores = classify_reviews(batch_reviews)
             raw_scores.extend(batch_scores)
             review_counter += len(batch_reviews)
             progress_bar.progress(review_counter / total_reviews)
     with st.spinner('Generating classes...'):
+        class_scores = classify_with_new_classes(df[review_column].tolist(), class_names)
     class_scores_dict = {}  # New dictionary to store class scores
     for i, name in enumerate(class_names):
     if class_names and not all(name.isspace() for name in class_names):
         df['Highest Class'] = df[class_names].idxmax(axis=1)
     df_new = df.copy()
     df_new['raw_scores'] = raw_scores
     scores_to_df(df_new)
 def scores_to_df(df):
     for i in range(1, 6):
         df[f'{i} Star'] = df['raw_scores'].apply(lambda scores: scores[i-1]).round(2)
             cols[i-1].markdown("No important words to display")
 if __name__ == "__main__":
     main()