TomData committed
Commit d0fd192 · Parent: a3f5633

Big layout update with some new functionalities

Files changed (3):
  1. Home.py +40 -25
  2. src/chatbot.py +59 -26
  3. src/vectordatabase.py +17 -43
Home.py CHANGED

@@ -1,7 +1,7 @@
 import gradio as gr
 from src.chatbot import chatbot, keyword_search
-from gradio_calendar import Calendar
-from datetime import datetime
 
+#from gradio_calendar import Calendar
+#from datetime import datetime
 
 # Define important variables
 legislature_periods = [
@@ -34,21 +34,23 @@ partys = ['All','CDU/CSU','SPD','AfD','Grüne','FDP','DIE LINKE.','GB/BHE','DRP'
 
 with gr.Blocks() as App:
     with gr.Tab("ChatBot"):
-        # Apply RAG using chatbut function from local file ChatBot.py
-        db_inputs = gr.Dropdown(choices=legislature_periods, value="All", multiselect=True, label="If empty all Legislaturperioden are selected", show_label=True)
-        print(db_inputs)
-
-
-        gr.ChatInterface(chatbot,
-                         title="PoliticsToYou",
-                         description= "This chatbot uses the infomation of speeches of the german parliament (since 2021) \
-                             to get insight on the view points of the german parties and the debate of the parliament.",
-                         #examples=["Wie steht die CDU zur Cannabislegalisierung?","Was waren die wichtigsten Themen in der aktuellen Legislaturperiode?"], #change to meaningful examples
-                         cache_examples=False, #true increases the loading time
-                         additional_inputs = db_inputs,
-                         )
+        with gr.Blocks():
+            # Apply RAG using chatbot function from local file ChatBot.py
+            db_inputs = gr.Dropdown(choices=legislature_periods, value="All", multiselect=True, label="Legislature", info="Select a combination of legislatures as basis for the chatbot's replies", show_label=True)
+            prompt_language = gr.Dropdown(choices=["DE", "EN"], value="DE", label="Language", info="Choose output language", multiselect=False)
+
+
+            gr.ChatInterface(chatbot,
+                             title="PoliticsToYou",
+                             description="Ask anything about your favorite political topic from any legislature period",
+                             examples=["Wie steht die CDU zur Cannabislegalisierung?", "Wie steht die FDP zur Rente?", "Was wird für die Rechte von LGBTQ getan?", "Sollen wir Waffen an die Ukraine liefern"],
+                             cache_examples=False, #true increases loading time
+                             additional_inputs=[db_inputs, prompt_language],
+                             additional_inputs_accordion="Additional inputs"
+                             )
 
-    with gr.Tab("KeyWordSearch"):
+    with gr.Tab("KeywordSearch"):
 
         with gr.Blocks() as Block:
             # Keyword Input
@@ -58,7 +60,7 @@ with gr.Blocks() as App:
         with gr.Accordion('Detailed filters', open=False):
             # Row orientation
             with gr.Row() as additional_input:
-                n_slider = gr.Slider(label="Number of Results", minimum=1, maximum=100, step=1, value=10)
+                n_slider = gr.Slider(label="Number of Results", info="Other filters reduce the returned results", minimum=1, maximum=100, step=1, value=10)
                 party_dopdown = gr.Dropdown(value='All', choices=partys, label='Party')
                 # ToDo: Add date or legislature filter as input
                 #start_date = Calendar(value="1949-01-01", type="datetime", label="Select start date", info="Click the calendar icon to bring up the calendar.", interactive=True)
@@ -111,17 +113,30 @@ with gr.Blocks() as App:
         )
 
     with gr.Tab("About"):
-        gr.Markdown("""**Motivation:**
-        The idea of this project is a combination of my curiosity in LLM application and my affection for speech data, that I developed during my bachelor thesis on measuring populism in text data.
-        I would like to allow people to discover interesting discussions, opinions and positions that were communicated in the german parliament thoughout the years.
-        **Development status:**
-        Chatbot: Users can interact with the chatbot asking questions about anything that can be answered by speeches. Furthermore they can select any legislature as a basis for the chatbot's reply.
-        Keyword
-
+        gr.Markdown("""<h1>Welcome to <strong>Politics2you</strong> - your playground for investigating the heart of politics in Germany.</h1>
+
+        <p>Would you like to gain insights into political debates or reveal party positions on specific topics from any legislature?</p>
+        <ul>
+        <li>You can use the ChatBot to ask all your questions or search for related speech content in the Keyword Search section.</li>
+        </ul>
+        <p>Enjoy your journey!</p>
+        <p>Looking forward to your feedback! <a href="mailto:[email protected]">[email protected]</a></p>
+
+        <h2>Further improvements & Ideas:</h2>
+        <ul>
+        <li>Experiment with different LLMs and Templates</li>
+        <li>Include chat history in RAG</li>
+        <li>Add a date or legislature filter to KeywordSearch</li>
+        <li>Exclude short document splits when creating the vectorstore</li>
+        <li>Improve inference time</li>
+        <li>Add analytic tools for party manifestos</li>
+        <li>Expand the scope to different countries</li>
+        </ul>
         """)
+
 
 
 if __name__ == "__main__":
-    App.launch(share=False) #t rue not supported on hf spaces
+    App.launch(share=False) # true not supported on hf spaces
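For context on the wiring above: `gr.ChatInterface` calls its callback with each `additional_inputs` component's value appended after `message` and `history`, in the order the components are listed, which is how `chatbot` receives `db_inputs` and `prompt_language`. A minimal runnable sketch of that behavior (the echo bot and the dropdown choices here are illustrative stand-ins, not part of this commit):

```python
import gradio as gr

# Hypothetical stand-in for src.chatbot.chatbot: the extra values arrive
# positionally after (message, history), matching the additional_inputs order.
def bot(message, history, db_inputs, prompt_language):
    return f"[{prompt_language}] {message} (sources: {db_inputs})"

demo = gr.ChatInterface(
    bot,
    additional_inputs=[
        gr.Dropdown(choices=["All", "20. Legislaturperiode"], value="All", multiselect=True, label="Legislature"),
        gr.Dropdown(choices=["DE", "EN"], value="DE", label="Language"),
    ],
    additional_inputs_accordion="Additional inputs",  # collapses the extra controls
)

if __name__ == "__main__":
    demo.launch(share=False)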
src/chatbot.py CHANGED

@@ -2,21 +2,20 @@ from langchain_core.prompts import ChatPromptTemplate
 from langchain_community.llms.huggingface_hub import HuggingFaceHub
 from langchain_community.embeddings import HuggingFaceEmbeddings
 
-
 from src.vectordatabase import RAG, get_vectorstore
 import pandas as pd
-from dotenv import load_dotenv, find_dotenv
-
-#Load environmental variables from .env-file
-#load_dotenv(find_dotenv())
 
+# Load environmental variables from .env-file
+# from dotenv import load_dotenv, find_dotenv
+# load_dotenv(find_dotenv())
 
-embeddings = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2")
+# Define important variables
+embeddings = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2") # Remove embedding input parameter from functions?
 llm = HuggingFaceHub(
-    # Try different model here
+    # ToDo: Try different models here
     repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
     # repo_id="CohereForAI/c4ai-command-r-v01", # too large 69gb
-    # repo_id="CohereForAI/c4ai-command-r-v01-4bit", # too large 22 gb
+    # repo_id="CohereForAI/c4ai-command-r-v01-4bit", # too large 22gb
     # repo_id="meta-llama/Meta-Llama-3-8B", # too large 16 gb
     task="text-generation",
     model_kwargs={
@@ -25,10 +24,8 @@ llm = HuggingFaceHub(
         "temperature": 0.1,
         "repetition_penalty": 1.03,
         }
-    #,huggingfacehub_api_token
-
 )
-# To Do: Experiment with different templates
+# ToDo: Experiment with different templates
 prompt_test = ChatPromptTemplate.from_template("""<s>[INST]
 Instruction: Beantworte die folgende Frage auf deutsch und nur auf der Grundlage des angegebenen Kontexts:
 
@@ -48,31 +45,67 @@ prompt_de = ChatPromptTemplate.from_template("""Beantworte die folgende Frage au
 """
 # Returns the answer in German
 )
-prompt_en = ChatPromptTemplate.from_template("""Beantworte die folgende Frage auf deutsch und nur auf der Grundlage des angegebenen Kontexts:
+prompt_en = ChatPromptTemplate.from_template("""Answer the following question in English and solely based on the provided context:
 
 <context>
 {context}
 </context>
 
-Frage: {input}
+Question: {input}
 """
-# Returns the answer in German
+# Returns the answer in English
 )
 
 
-#folder_path =
-#index_name = "speeches_1949_09_12"
-#index_name = "legislature20"
-#db = get
-
-def chatbot(message, history, db_inputs, llm=llm, prompt=prompt_de):
+def chatbot(message, history, db_inputs, prompt_language, llm=llm):
+    """
+    Generate a response from the chatbot based on the provided message, history, database inputs, prompt language, and LLM model.
+
+    Parameters:
+    -----------
+    message : str
+        The message or question to be answered by the chatbot.
+
+    history : list
+        The history of previous interactions or messages.
+
+    db_inputs : list
+        A list of strings specifying which vector stores to combine. Each string represents a specific index or a special keyword "All".
+
+    prompt_language : str
+        The language of the prompt to be used for generating the response. Should be either "DE" for German or "EN" for English.
+
+    llm : LLM, optional
+        An instance of the Language Model to be used for generating the response. Defaults to the global variable `llm`.
+
+    Returns:
+    --------
+    str
+        The response generated by the chatbot.
+    """
+
     db = get_vectorstore(inputs = db_inputs, embeddings=embeddings)
-    raw_response = RAG(llm=llm, prompt=prompt, db=db, question=message)
-    # Only necessary because mistral does include it´s json structure in the output
-    try:
-        response = raw_response['answer'].split("Antwort: ")[1]
-    except:
-        response = raw_response['answer']
+
+    # Select prompt based on user input
+    if prompt_language == "DE":
+        prompt = prompt_de
+        raw_response = RAG(llm=llm, prompt=prompt, db=db, question=message)
+        # Only necessary because Mistral includes its JSON structure in the output, including the input content
+        try:
+            response = raw_response['answer'].split("Antwort: ")[1]
+        except:
+            response = raw_response['answer']
+        return response
+    else:
+        prompt = prompt_en
+        raw_response = RAG(llm=llm, prompt=prompt, db=db, question=message)
+        # Only necessary because Mistral includes its JSON structure in the output, including the input content
+        try:
+            response = raw_response['answer'].split("Answer: ")[1]
+        except:
+            response = raw_response['answer']
+
     return response
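The two branches of the new `chatbot` differ only in the prompt template and the marker used to strip Mixtral's echoed prompt from the answer. A possible table-driven variant of the same logic (a sketch, not part of the commit; `PROMPTS` is an invented name, and `prompt_de`, `prompt_en`, `embeddings`, `llm`, `RAG`, and `get_vectorstore` are assumed from this module):

```python
# Map language code -> (prompt template, answer marker); unknown codes fall back to EN,
# mirroring the commit's if/else behavior.
PROMPTS = {"DE": (prompt_de, "Antwort: "), "EN": (prompt_en, "Answer: ")}

def chatbot(message, history, db_inputs, prompt_language, llm=llm):
    prompt, marker = PROMPTS.get(prompt_language, PROMPTS["EN"])
    db = get_vectorstore(inputs=db_inputs, embeddings=embeddings)
    raw_response = RAG(llm=llm, prompt=prompt, db=db, question=message)
    answer = raw_response["answer"]
    # Keep only the text after the marker when the model echoes the prompt;
    # partition() avoids the IndexError that split()[1] raises when the marker is absent.
    _, found, tail = answer.partition(marker)
    return tail if found else answer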
src/vectordatabase.py CHANGED

@@ -1,14 +1,14 @@
 from langchain_community.document_loaders import DataFrameLoader
 from langchain_community.embeddings import HuggingFaceEmbeddings
-from langchain_core.prompts import ChatPromptTemplate
 from langchain_community.vectorstores import FAISS
-from langchain_community.llms import HuggingFaceHub
+
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.chains.combine_documents import create_stuff_documents_chain
 from langchain.chains import create_retrieval_chain
-from faiss import IndexFlatL2
+
 from langchain_community.docstore.in_memory import InMemoryDocstore
-from langchain.embeddings import SentenceTransformerEmbeddings
+from faiss import IndexFlatL2
+
 #import functools
 
 import pandas as pd
@@ -62,45 +62,33 @@ def get_vectorstore(inputs, embeddings):
     """
     Combine multiple FAISS vector stores into a single vector store based on the specified inputs.
 
-    Parameters:
+    Parameters
     ----------
     inputs : list of str
         A list of strings specifying which vector stores to combine. Each string represents a specific
-        index or a special keyword "All". If "All" is included in the list, it will load a pre-defined
-        comprehensive vector store and return immediately.
+        index or a special keyword "All". If "All" is the first entry in the list,
+        it directly returns the pre-defined vector store for all speeches.
 
     embeddings : Embeddings
         An instance of embeddings that will be used to load the vector stores. The specific type and
         structure of `embeddings` depend on the implementation of the `get_vectorstore` function.
 
-    Returns:
+    Returns
     -------
     FAISS
         A FAISS vector store that combines the specified indices into a single vector store.
 
-    Notes:
-    -----
-    - The `folder_path` variable is set to the default path "./src/FAISS", where the FAISS index files are stored.
-    - The function initializes an empty FAISS vector store with a dimensionality of 128.
-    - If "All" is specified in the `inputs`, it directly loads and returns the comprehensive vector store named "speeches_1949_09_12".
-    - For each specific index in `inputs`, it retrieves the corresponding vector store and merges it with the initialized FAISS vector store.
-    - The `FAISS.load_local` method is used to load vector stores from the local file system.
-      The `allow_dangerous_deserialization` parameter is set to True to allow loading of potentially unsafe serialized objects.
     """
 
     # Default folder path
     folder_path = "./src/FAISS"
 
-    if inputs[0] == "All":
-        # index_name = "speeches_1949_09_12"
-        # db = FAISS.load_local(folder_path=folder_path, index_name=index_name,
-        #                       embeddings=embeddings, allow_dangerous_deserialization=True)
+    if inputs[0] == "All" or inputs[0] is None:
         return db_all
-
 
     # Initialize empty db
-    embedding_function = embeddings #SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
-    dimensions: int = len(embedding_function.embed_query("dummy"))
+    embedding_function = embeddings
+    dimensions = len(embedding_function.embed_query("dummy"))
 
     db = FAISS(
         embedding_function=embedding_function,
@@ -112,16 +100,21 @@ def get_vectorstore(inputs, embeddings):
 
     # Retrieve inputs: 20. Legislaturperiode, 19. Legislaturperiode, ...
     for input in inputs:
+        # Ignore if user also selected All among other legislatures
+        if input == "All":
+            continue
         # Retrieve selected index and merge vector stores
         index = input.split(".")[0]
         index_name = f'{index}_legislature'
         local_db = FAISS.load_local(folder_path=folder_path, index_name=index_name,
-                                    embeddings=embeddings, allow_dangerous_deserialization=True)
+                                    embeddings=embeddings, allow_dangerous_deserialization=True)
         db.merge_from(local_db)
+    print('Successfully merged inputs')
     return db
 
 
+
 def RAG(llm, prompt, db, question):
     """
     Apply Retrieval-Augmented Generation (RAG) by providing the context and the question to the
@@ -157,22 +150,3 @@ def RAG(llm, prompt, db, question):
 
     return response
 
-
-#########
-# Dynamically loading vector_db
-##########
-
-def get_similar_vectorstore(start_date, end_date, party, base_path='src\FAISS'):
-
-    # Get all file names
-    vector_stores = [store for store in os.listdir(base_path) if store.split(".")[1] == "faiss"]
-
-    df = pd.DataFrame(culumns=["file_name", "start_date", "end_date", "date_diff"])
-    # Extract metadata of file from its name
-    for file_name in vector_stores:
-        file_name = file_name.split(".")[0]
-        file_elements = file_name.split("_")
-        file_start_date, file_end_date, file_party = file_elements[1], file_elements[2], file_elements[3]
-
-        if file_party == party and file_start_date <= start_date:
-            None
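For reference, the merge pattern `get_vectorstore` relies on, an empty `IndexFlatL2`-backed store grown via `merge_from`, in isolation. A minimal sketch, assuming index files such as `20_legislature` exist under ./src/FAISS and were built with the same embedding model (the on-disk names are assumptions):

```python
from faiss import IndexFlatL2
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2")
# The embedding width determines the FAISS index dimensionality (instead of a hard-coded 128).
dimensions = len(embeddings.embed_query("dummy"))

# Start from an empty L2 index and fold in each stored legislature index.
db = FAISS(
    embedding_function=embeddings,
    index=IndexFlatL2(dimensions),
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)
for index_name in ["20_legislature", "19_legislature"]:  # assumed on-disk index names
    local_db = FAISS.load_local(
        folder_path="./src/FAISS",
        index_name=index_name,
        embeddings=embeddings,
        allow_dangerous_deserialization=True,  # required for pickle-backed FAISS files
    )
    db.merge_from(local_db)

print(f"Merged store holds {db.index.ntotal} vectors")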