TomData committed
Commit d0fd192 · Parent: a3f5633

Big layout update with some new functionalities

Files changed (3):
  1. Home.py +40 -25
  2. src/chatbot.py +59 -26
  3. src/vectordatabase.py +17 -43
Home.py CHANGED

@@ -1,7 +1,7 @@
 import gradio as gr
 from src.chatbot import chatbot, keyword_search
-from gradio_calendar import Calendar
-from datetime import datetime
 
+#from gradio_calendar import Calendar
+#from datetime import datetime
 
 # Define important variables
 legislature_periods = [
@@ -34,21 +34,23 @@ partys = ['All','CDU/CSU','SPD','AfD','Grüne','FDP','DIE LINKE.','GB/BHE','DRP'
 
 with gr.Blocks() as App:
     with gr.Tab("ChatBot"):
-        # Apply RAG using chatbut function from local file ChatBot.py
-        db_inputs = gr.Dropdown(choices=legislature_periods, value="All", multiselect=True, label="If empty all Legislaturperioden are selected", show_label=True)
-        print(db_inputs)
-
-
-        gr.ChatInterface(chatbot,
-                         title="PoliticsToYou",
-                         description= "This chatbot uses the infomation of speeches of the german parliament (since 2021) \
-                             to get insight on the view points of the german parties and the debate of the parliament.",
-                         #examples=["Wie steht die CDU zur Cannabislegalisierung?","Was waren die wichtigsten Themen in der aktuellen Legislaturperiode?"], #change to meaningful examples
-                         cache_examples=False, #true increases the loading time
-                         additional_inputs = db_inputs,
-                         )
+        with gr.Blocks():
+            # Apply RAG using chatbot function from local file ChatBot.py
+            db_inputs = gr.Dropdown(choices=legislature_periods, value="All", multiselect=True, label="Legislature", info="Select a combination of legislatures as basis for the chatbot's replies", show_label=True)
+            prompt_language = gr.Dropdown(choices=["DE", "EN"], value="DE", label="Language", info="Choose output language", multiselect=False)
+
+
+            gr.ChatInterface(chatbot,
+                             title="PoliticsToYou",
+                             description="Ask anything about your favorite political topic from any legislature period",
+                             examples=["Wie steht die CDU zur Cannabislegalisierung?", "Wie steht die FDP zur Rente?", "Was wird für die Rechte von LGBTQ getan?", "Sollen wir Waffen an die Ukraine liefern"],
+                             cache_examples=False, #true increases loading time
+                             additional_inputs=[db_inputs, prompt_language],
+                             additional_inputs_accordion="Additional inputs"
+                             )
 
-    with gr.Tab("KeyWordSearch"):
+    with gr.Tab("KeywordSearch"):
 
         with gr.Blocks() as Block:
             # Keyword Input
@@ -58,7 +60,7 @@ with gr.Blocks() as App:
         with gr.Accordion('Detailed filters', open=False):
             # Row orientation
             with gr.Row() as additional_input:
-                n_slider = gr.Slider(label="Number of Results", minimum=1, maximum=100, step=1, value=10)
+                n_slider = gr.Slider(label="Number of Results", info="Other filters reduce the returned results", minimum=1, maximum=100, step=1, value=10)
                 party_dopdown = gr.Dropdown(value='All', choices=partys, label='Party')
                 # ToDo: Add date or legislature filter as input
                 #start_date = Calendar(value="1949-01-01", type="datetime", label="Select start date", info="Click the calendar icon to bring up the calendar.", interactive=True)
@@ -111,17 +113,30 @@ with gr.Blocks() as App:
         )
 
     with gr.Tab("About"):
-        gr.Markdown("""**Motivation:**
-        The idea of this project is a combination of my curiosity in LLM application and my affection for speech data, that I developed during my bachelor thesis on measuring populism in text data.
-        I would like to allow people to discover interesting discussions, opinions and positions that were communicated in the german parliament thoughout the years.
-        **Development status:**
-        Chatbot: Users can interact with the chatbot asking questions about anything that can be answered by speeches. Furthermore they can select any legislature as a basis for the chatbot's reply.
-        Keyword
-
+        gr.Markdown("""<h1>Welcome to <strong>Politics2you</strong> - your playground for investigating the heart of politics in Germany.</h1>
+
+        <p>Would you like to gain insights into political debates or reveal party positions on specific topics from any legislature?</p>
+        <ul>
+        <li>You can use the ChatBot to ask all your questions or search for related speech content in the Keyword Search section.</li>
+        </ul>
+        <p>Enjoy your journey!</p>
+        <p>Looking forward to your feedback! <a href="mailto:[email protected]">[email protected]</a></p>
+
+        <h2>Further improvements & Ideas:</h2>
+        <ul>
+        <li>Experiment with different LLMs and Templates</li>
+        <li>Include chat history in RAG</li>
+        <li>Add a date or legislature filter to KeywordSearch</li>
+        <li>Exclude short document splits when creating the vectorstore</li>
+        <li>Improve inference time</li>
+        <li>Add analytic tools for party manifestos</li>
+        <li>Expand the scope to different countries</li>
+        </ul>
         """)
+
 
 
 if __name__ == "__main__":
-    App.launch(share=False) #t rue not supported on hf spaces
+    App.launch(share=False) # true not supported on hf spaces
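For context on the wiring above: `gr.ChatInterface` calls its callback with each `additional_inputs` component's value appended after `message` and `history`, in the order the components are listed, which is how `chatbot` receives `db_inputs` and `prompt_language`. A minimal runnable sketch of that behavior (the echo bot and the dropdown choices here are illustrative stand-ins, not part of this commit):

```python
import gradio as gr

# Hypothetical stand-in for src.chatbot.chatbot: the extra values arrive
# positionally after (message, history), matching the additional_inputs order.
def bot(message, history, db_inputs, prompt_language):
    return f"[{prompt_language}] {message} (sources: {db_inputs})"

demo = gr.ChatInterface(
    bot,
    additional_inputs=[
        gr.Dropdown(choices=["All", "20. Legislaturperiode"], value="All", multiselect=True, label="Legislature"),
        gr.Dropdown(choices=["DE", "EN"], value="DE", label="Language"),
    ],
    additional_inputs_accordion="Additional inputs",  # collapses the extra controls
)

if __name__ == "__main__":
    demo.launch(share=False)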
src/chatbot.py CHANGED

@@ -2,21 +2,20 @@ from langchain_core.prompts import ChatPromptTemplate
 from langchain_community.llms.huggingface_hub import HuggingFaceHub
 from langchain_community.embeddings import HuggingFaceEmbeddings
 
-
 from src.vectordatabase import RAG, get_vectorstore
 import pandas as pd
-from dotenv import load_dotenv, find_dotenv
-
-#Load environmental variables from .env-file
-#load_dotenv(find_dotenv())
 
+# Load environmental variables from .env-file
+# from dotenv import load_dotenv, find_dotenv
+# load_dotenv(find_dotenv())
 
-embeddings = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2")
+# Define important variables
+embeddings = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2") # Remove embedding input parameter from functions?
 llm = HuggingFaceHub(
-    # Try different model here
+    # ToDo: Try different models here
     repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
     # repo_id="CohereForAI/c4ai-command-r-v01", # too large 69gb
-    # repo_id="CohereForAI/c4ai-command-r-v01-4bit", # too large 22 gb
+    # repo_id="CohereForAI/c4ai-command-r-v01-4bit", # too large 22gb
     # repo_id="meta-llama/Meta-Llama-3-8B", # too large 16 gb
     task="text-generation",
     model_kwargs={
@@ -25,10 +24,8 @@ llm = HuggingFaceHub(
         "temperature": 0.1,
         "repetition_penalty": 1.03,
         }
-    #,huggingfacehub_api_token
-
 )
-# To Do: Experiment with different templates
+# ToDo: Experiment with different templates
 prompt_test = ChatPromptTemplate.from_template("""<s>[INST]
 Instruction: Beantworte die folgende Frage auf deutsch und nur auf der Grundlage des angegebenen Kontexts:
 
@@ -48,31 +45,67 @@ prompt_de = ChatPromptTemplate.from_template("""Beantworte die folgende Frage au
 """
 # Returns the answer in German
 )
-prompt_en = ChatPromptTemplate.from_template("""Beantworte die folgende Frage auf deutsch und nur auf der Grundlage des angegebenen Kontexts:
+prompt_en = ChatPromptTemplate.from_template("""Answer the following question in English and solely based on the provided context:
 
 <context>
 {context}
 </context>
 
-Frage: {input}
+Question: {input}
 """
-# Returns the answer in German
+# Returns the answer in English
 )
 
 
-#folder_path =
-#index_name = "speeches_1949_09_12"
-#index_name = "legislature20"
-#db = get
-
-def chatbot(message, history, db_inputs, llm=llm, prompt=prompt_de):
+def chatbot(message, history, db_inputs, prompt_language, llm=llm):
+    """
+    Generate a response from the chatbot based on the provided message, history, database inputs, prompt language, and LLM model.
+
+    Parameters:
+    -----------
+    message : str
+        The message or question to be answered by the chatbot.
+
+    history : list
+        The history of previous interactions or messages.
+
+    db_inputs : list
+        A list of strings specifying which vector stores to combine. Each string represents a specific index or a special keyword "All".
+
+    prompt_language : str
+        The language of the prompt to be used for generating the response. Should be either "DE" for German or "EN" for English.
+
+    llm : LLM, optional
+        An instance of the Language Model to be used for generating the response. Defaults to the global variable `llm`.
+
+    Returns:
+    --------
+    str
+        The response generated by the chatbot.
+    """
+
     db = get_vectorstore(inputs = db_inputs, embeddings=embeddings)
-    raw_response = RAG(llm=llm, prompt=prompt, db=db, question=message)
-    # Only necessary because mistral does include it´s json structure in the output
-    try:
-        response = raw_response['answer'].split("Antwort: ")[1]
-    except:
-        response = raw_response['answer']
+
+    # Select prompt based on user input
+    if prompt_language == "DE":
+        prompt = prompt_de
+        raw_response = RAG(llm=llm, prompt=prompt, db=db, question=message)
+        # Only necessary because Mistral includes its JSON structure in the output, including the input content
+        try:
+            response = raw_response['answer'].split("Antwort: ")[1]
+        except:
+            response = raw_response['answer']
+        return response
+    else:
+        prompt = prompt_en
+        raw_response = RAG(llm=llm, prompt=prompt, db=db, question=message)
+        # Only necessary because Mistral includes its JSON structure in the output, including the input content
+        try:
+            response = raw_response['answer'].split("Answer: ")[1]
+        except:
+            response = raw_response['answer']
+
     return response
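The two branches of the new `chatbot` differ only in the prompt template and the marker used to strip Mixtral's echoed prompt from the answer. A possible table-driven variant of the same logic (a sketch, not part of the commit; `PROMPTS` is an invented name, and `prompt_de`, `prompt_en`, `embeddings`, `llm`, `RAG`, and `get_vectorstore` are assumed from this module):

```python
# Map language code -> (prompt template, answer marker); unknown codes fall back to EN,
# mirroring the commit's if/else behavior.
PROMPTS = {"DE": (prompt_de, "Antwort: "), "EN": (prompt_en, "Answer: ")}

def chatbot(message, history, db_inputs, prompt_language, llm=llm):
    prompt, marker = PROMPTS.get(prompt_language, PROMPTS["EN"])
    db = get_vectorstore(inputs=db_inputs, embeddings=embeddings)
    raw_response = RAG(llm=llm, prompt=prompt, db=db, question=message)
    answer = raw_response["answer"]
    # Keep only the text after the marker when the model echoes the prompt;
    # partition() avoids the IndexError that split()[1] raises when the marker is absent.
    _, found, tail = answer.partition(marker)
    return tail if found else answer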
src/vectordatabase.py CHANGED

@@ -1,14 +1,14 @@
 from langchain_community.document_loaders import DataFrameLoader
 from langchain_community.embeddings import HuggingFaceEmbeddings
-from langchain_core.prompts import ChatPromptTemplate
 from langchain_community.vectorstores import FAISS
-from langchain_community.llms import HuggingFaceHub
+
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.chains.combine_documents import create_stuff_documents_chain
 from langchain.chains import create_retrieval_chain
-from faiss import IndexFlatL2
+
 from langchain_community.docstore.in_memory import InMemoryDocstore
-from langchain.embeddings import SentenceTransformerEmbeddings
+from faiss import IndexFlatL2
+
 #import functools
 
 import pandas as pd
@@ -62,45 +62,33 @@ def get_vectorstore(inputs, embeddings):
     """
     Combine multiple FAISS vector stores into a single vector store based on the specified inputs.
 
-    Parameters:
+    Parameters
     ----------
     inputs : list of str
         A list of strings specifying which vector stores to combine. Each string represents a specific
-        index or a special keyword "All". If "All" is included in the list, it will load a pre-defined
-        comprehensive vector store and return immediately.
+        index or a special keyword "All". If "All" is the first entry in the list,
+        it directly returns the pre-defined vector store for all speeches.
 
     embeddings : Embeddings
         An instance of embeddings that will be used to load the vector stores. The specific type and
         structure of `embeddings` depend on the implementation of the `get_vectorstore` function.
 
-    Returns:
+    Returns
     -------
     FAISS
         A FAISS vector store that combines the specified indices into a single vector store.
 
-    Notes:
-    -----
-    - The `folder_path` variable is set to the default path "./src/FAISS", where the FAISS index files are stored.
-    - The function initializes an empty FAISS vector store with a dimensionality of 128.
-    - If "All" is specified in the `inputs`, it directly loads and returns the comprehensive vector store named "speeches_1949_09_12".
-    - For each specific index in `inputs`, it retrieves the corresponding vector store and merges it with the initialized FAISS vector store.
-    - The `FAISS.load_local` method is used to load vector stores from the local file system.
-      The `allow_dangerous_deserialization` parameter is set to True to allow loading of potentially unsafe serialized objects.
     """
 
     # Default folder path
     folder_path = "./src/FAISS"
 
-    if inputs[0] == "All":
-        # index_name = "speeches_1949_09_12"
-        # db = FAISS.load_local(folder_path=folder_path, index_name=index_name,
-        #                       embeddings=embeddings, allow_dangerous_deserialization=True)
+    if inputs[0] == "All" or inputs[0] is None:
         return db_all
-
 
     # Initialize empty db
-    embedding_function = embeddings #SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
-    dimensions: int = len(embedding_function.embed_query("dummy"))
+    embedding_function = embeddings
+    dimensions = len(embedding_function.embed_query("dummy"))
 
     db = FAISS(
         embedding_function=embedding_function,
@@ -112,16 +100,21 @@ def get_vectorstore(inputs, embeddings):
 
     # Retrieve inputs: 20. Legislaturperiode, 19. Legislaturperiode, ...
     for input in inputs:
+        # Ignore if user also selected All among other legislatures
+        if input == "All":
+            continue
         # Retrieve selected index and merge vector stores
         index = input.split(".")[0]
         index_name = f'{index}_legislature'
         local_db = FAISS.load_local(folder_path=folder_path, index_name=index_name,
-                                    embeddings=embeddings, allow_dangerous_deserialization=True)
+                                    embeddings=embeddings, allow_dangerous_deserialization=True)
         db.merge_from(local_db)
+    print('Successfully merged inputs')
     return db
 
 
+
 def RAG(llm, prompt, db, question):
     """
     Apply Retrieval-Augmented Generation (RAG) by providing the context and the question to the
@@ -157,22 +150,3 @@ def RAG(llm, prompt, db, question):
 
     return response
 
-
-#########
-# Dynamically loading vector_db
-##########
-
-def get_similar_vectorstore(start_date, end_date, party, base_path='src\FAISS'):
-
-    # Get all file names
-    vector_stores = [store for store in os.listdir(base_path) if store.split(".")[1] == "faiss"]
-
-    df = pd.DataFrame(culumns=["file_name", "start_date", "end_date", "date_diff"])
-    # Extract metadata of file from its name
-    for file_name in vector_stores:
-        file_name = file_name.split(".")[0]
-        file_elements = file_name.split("_")
-        file_start_date, file_end_date, file_party = file_elements[1], file_elements[2], file_elements[3]
-
-        if file_party == party and file_start_date <= start_date:
-            None
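For reference, the merge pattern `get_vectorstore` relies on, an empty `IndexFlatL2`-backed store grown via `merge_from`, in isolation. A minimal sketch, assuming index files such as `20_legislature` exist under ./src/FAISS and were built with the same embedding model (the on-disk names are assumptions):

```python
from faiss import IndexFlatL2
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2")
# The embedding width determines the FAISS index dimensionality (instead of a hard-coded 128).
dimensions = len(embeddings.embed_query("dummy"))

# Start from an empty L2 index and fold in each stored legislature index.
db = FAISS(
    embedding_function=embeddings,
    index=IndexFlatL2(dimensions),
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)
for index_name in ["20_legislature", "19_legislature"]:  # assumed on-disk index names
    local_db = FAISS.load_local(
        folder_path="./src/FAISS",
        index_name=index_name,
        embeddings=embeddings,
        allow_dangerous_deserialization=True,  # required for pickle-backed FAISS files
    )
    db.merge_from(local_db)

print(f"Merged store holds {db.index.ntotal} vectors")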