Spaces:

fschwartzer
/

streamlit_chatbot

Running

App Files Files Community

fschwartzer committed on Sep 12, 2024

Commit

ede2957

verified ·

1 Parent(s): 91e5f2f

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -16

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ import torch
 from transformers import pipeline
 import datetime
 from rapidfuzz import process, fuzz
 # Load the CSV file
 df = pd.read_csv("anomalies.csv", quotechar='"')
@@ -14,29 +15,39 @@ df['real'] = df['real'].apply(lambda x: f"{x:.2f}")
 # Fill NaN values and convert all columns to strings
 df = df.fillna('').astype(str)
-# Function to filter the DataFrame using RapidFuzz
-def filter_dataframe(df, date_str, group_keyword, threshold=80):
-    # Apply fuzzy matching on the 'ds' (date) and 'Group' columns
-    date_matches = process.extract(date_str, df['ds'], scorer=fuzz.token_sort_ratio, limit=None)
-    group_matches = process.extract(group_keyword, df['Group'], scorer=fuzz.token_sort_ratio, limit=None)
-    # Get the indices that match both criteria
-    date_indices = {match[2] for match in date_matches if match[1] >= threshold}
-    group_indices = {match[2] for match in group_matches if match[1] >= threshold}
-    common_indices = list(date_indices & group_indices)
-    return df.iloc[common_indices]
 # Function to generate a response using the TAPAS model
 def response(user_question, df):
     a = datetime.datetime.now()
-    # Extract date and group keywords from the user question
-    date_str = "December 2022"  # Example; you'd extract this from the user question dynamically
-    group_keyword = "IPVA"
-    # Filter the DataFrame by date and group
-    subset_df = filter_dataframe(df, date_str, group_keyword)
     # Check if the DataFrame is empty
     if subset_df.empty:

 from transformers import pipeline
 import datetime
 from rapidfuzz import process, fuzz
+from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
 # Load the CSV file
 df = pd.read_csv("anomalies.csv", quotechar='"')
 # Fill NaN values and convert all columns to strings
 df = df.fillna('').astype(str)
+# Filter 'real' higher than 10 Million
+df= df[df['real'] >= 1000000.]
+print(df)
+# Function to remove stopwords
+def remove_stopwords(text, stopwords=ENGLISH_STOP_WORDS):
+    return ' '.join([word for word in text.split() if word.lower() not in stopwords])
+# Function to filter DataFrame by checking if any of the user question words are in the columns
+def filter_dataframe(df, user_question, threshold=80):
+    user_question = remove_stopwords(user_question)  # Remove stopwords
+    question_words = user_question.split()
+    mask = pd.Series([False] * len(df))
+    for column in df.columns:
+        for word in question_words:
+            # Apply RapidFuzz fuzzy matching on the column
+            matches = process.extract(word, df[column], scorer=fuzz.token_sort_ratio, limit=None)
+            match_indices = [match[2] for match in matches if match[1] >= threshold]
+            mask.iloc[match_indices] = True
+    filtered_df = df[mask]
+    return filtered_df
 # Function to generate a response using the TAPAS model
 def response(user_question, df):
     a = datetime.datetime.now()
+    # Filter the DataFrame dynamically by user question
+    subset_df = filter_dataframe(df, user_question)
     # Check if the DataFrame is empty
     if subset_df.empty: