Mishab commited on
Commit
519c3e7
1 Parent(s): a632928

Initial Push of small files

Browse files
Files changed (4) hide show
  1. app.py +244 -0
  2. opm_logo.png +0 -0
  3. requirements.txt +15 -0
  4. utils.py +279 -0
app.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from pypdf import PdfReader
3
+ # import replicate
4
+ import os
5
+ from pathlib import Path
6
+ from dotenv import load_dotenv
7
+ import pickle
8
+ import timeit
9
+ from PIL import Image
10
+ import datetime
11
+ import base64
12
+
13
+ from langchain.embeddings import HuggingFaceEmbeddings
14
+ from langchain.vectorstores import FAISS
15
+ from langchain.document_loaders import PyPDFLoader
16
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
17
+ from langchain.document_loaders import PyPDFLoader, DirectoryLoader
18
+ from langchain.memory import ConversationBufferMemory
19
+ from langchain.chains import ConversationalRetrievalChain
20
+ from langchain.prompts.prompt import PromptTemplate
21
+ from langchain.llms import LlamaCpp
22
+ from langchain.callbacks.manager import CallbackManager
23
+ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
24
+ from langchain.vectorstores import Chroma
25
+ from langchain.document_loaders import PyPDFDirectoryLoader
26
+ from langchain.retrievers import BM25Retriever, EnsembleRetriever
27
+ from langchain.chat_models import ChatOpenAI
28
+ from langchain.agents.agent_toolkits import create_retriever_tool
29
+ from langchain.agents.agent_toolkits import create_conversational_retrieval_agent
30
+ from langchain.utilities import SerpAPIWrapper
31
+
32
+ from utils import build_embedding_model, build_llm
33
+ from utils import load_ensemble_retriver, load_text_chunks, load_vectorstore, load_conversational_retrievel_agent
34
+
35
+ load_dotenv()
36
+ # Getting current timestamp to keep track of historical conversations
37
+ current_timestamp = datetime.datetime.now()
38
+ timestamp_string = current_timestamp.strftime("%Y-%m-%d %H:%M:%S")
39
+
40
+ #Directories path
41
+ persist_directory= "Database/PDF_HTML_CHROMA_DB"
42
+ all_docs_pkl_directory= 'Database/text_chunks_html_pdf.pkl'
43
+
44
+ # Initializing session state in Streamlit to cache expensive objects (such as model initialization) and thereby avoid re-running already-initialized steps over and over.
45
# Each entry is (session key, zero-argument factory). Order matters: later
# factories read objects that earlier entries have already stored in
# st.session_state, so the factories are only evaluated lazily, in sequence.
_SESSION_FACTORIES = (
    ("llm", build_llm),
    ("embeddings", build_embedding_model),
    (
        "vector_db",
        lambda: load_vectorstore(
            persist_directory=persist_directory,
            embeddings=st.session_state["embeddings"],
        ),
    ),
    (
        "text_chunks",
        lambda: load_text_chunks(text_chunks_pkl_dir=all_docs_pkl_directory),
    ),
    (
        "ensemble_retriver",
        lambda: load_ensemble_retriver(
            text_chunks=st.session_state["text_chunks"],
            embeddings=st.session_state["embeddings"],
            chroma_vectorstore=st.session_state["vector_db"],
        ),
    ),
    (
        "agent_executor",
        lambda: load_conversational_retrievel_agent(
            retriever=st.session_state["ensemble_retriver"],
            llm=st.session_state["llm"],
        ),
    ),
)

# Build every object at most once per Streamlit session; script re-runs
# find the key already present and skip the factory.
for _state_key, _state_factory in _SESSION_FACTORIES:
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_factory()
62
+
63
+
64
+
65
+ # App title
66
+ st.set_page_config(
67
+ page_title="OMP Search Bot",
68
+ layout="wide",
69
+ initial_sidebar_state="expanded",
70
+ )
71
+
72
+ st.markdown("""
73
+ <style>
74
+ .block-container {
75
+ padding-top: 2.2rem}
76
+ </style>
77
+ """, unsafe_allow_html=True)
78
+ # To get header in the App
79
+ col1, col2= st.columns(2)
80
+
81
+ title1 = """
82
+ <p style="font-size: 26px;text-align: right; color: #0C3453; font-weight: bold">OPM Retirement Services Assistant</p>
83
+ """
84
+
85
def clear_chat_history():
    """Reset the conversation to the initial assistant greeting.

    The greeting carries an empty "Source" entry because the chat renderer
    in this script reads message["Source"] for every stored message; a
    message without that key would raise KeyError on the next re-run.
    """
    st.session_state.messages = [
        {"role": "assistant", "content": "How may I assist you today?", "Source": ""}
    ]
87
+
88
# Read the logo and base64-encode it so it can be inlined in the header HTML
# as a data: URL. The context manager closes the handle even if read() fails.
with open("opm_logo.png", "rb") as file_:
    contents = file_.read()
data_url = base64.b64encode(contents).decode("utf-8")
92
+
93
+ st.markdown(
94
+ f"""
95
+ <div style="background-color: white; padding: 15px; border-radius: 10px;">
96
+ <div style="display: flex; justify-content: space-between;">
97
+ <div>
98
+ <img src="data:image/png;base64,{data_url}" style="max-width: 100%;" alt="OPM Logo" />
99
+ </div>
100
+ <div style="flex: 1; padding: 15px;">
101
+ {title1}
102
+ """,
103
+ unsafe_allow_html=True
104
+ )
105
+ st.write("")
106
+
107
+
108
+ st.write('<p style="color: #B0B0B0;margin: 0;">OPM is here to help you transition from serving the American people to enjoying your retirement. This retirement services assistant shows our commitment to supporting new and existing retirees throughout the retirement journey. Our assistant is trained on 1500+ documents related to OPM retirement services and can answer your questions in conversational style. Just ask away..</p>', unsafe_allow_html=True)
109
+
110
+ st.markdown("""---""")
111
+
112
+ text_html = """
113
+ <p style="font-size: 14px; text-align: center; color: #727477; margin: 0;">
114
+ Type your question in conversational style
115
+ </p>
116
+ <p style="font-size: 14px; text-align: center; color: #727477; margin: 0;">
117
+ Example: What are interim benefits?
118
+ </p>
119
+ """
120
+
121
+ st.write(text_html, unsafe_allow_html=True)
122
+
123
+
124
+ with st.sidebar:
125
+ st.subheader("")
126
+
127
+ if st.session_state["vector_db"] and st.session_state["llm"]:
128
+ # Store LLM generated responses
129
+ if "messages" not in st.session_state.keys():
130
+ st.session_state.messages = [{"role": "assistant", "content": "How may I assist you today?", "Source":""}]
131
+
132
+ # Display or clear chat messages
133
+ for message in st.session_state.messages:
134
+ with st.chat_message(message["role"]):
135
+ st.write(message["content"])
136
+ if message["Source"]=="":
137
+ st.write("")
138
+ else:
139
+ with st.expander("source"):
140
+ for idx, item in enumerate(message["Source"]):
141
+ st.markdown(item["Page"])
142
+ st.markdown(item["Source"])
143
+ st.markdown(item["page_content"])
144
+ st.write("---")
145
+
146
+
147
+ # Initialize the session state to store chat history
148
+ if "stored_session" not in st.session_state:
149
+ st.session_state["stored_session"] = []
150
+
151
+ # Create a list to store expanders
152
+ if "expanders" not in st.session_state:
153
+ st.session_state["expanders"] = []
154
+
155
+ # Define a function to add a new chat expander
156
def add_chat_expander(chat_history):
    """Archive a finished conversation for display in the sidebar.

    Appends a dict with the current local time (formatted as
    "YYYY-MM-DD HH:MM:SS") and the given chat history to
    st.session_state["expanders"].
    """
    ended_at = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    archive_entry = {"timestamp": ended_at, "chat_history": chat_history}
    st.session_state["expanders"].append(archive_entry)
160
+
161
def clear_chat_history():
    """
    Archive the current conversation and start a new one.

    Every stored message is flattened to a "User: ..." or "Assistant: ..."
    line, recorded in st.session_state["stored_session"], and the collected
    transcript is handed to add_chat_expander. The message list is then reset
    to the initial assistant greeting.
    """
    transcript = []
    for entry in st.session_state.messages:
        speaker = "User: " if entry["role"] == "user" else "Assistant: "
        line = speaker + entry["content"] + "\n\n"
        st.session_state["stored_session"].append(line)
        # NOTE(review): assumes both user and assistant lines belong in the
        # archived transcript (the sidebar renders both prefixes) - confirm.
        transcript.append(line)

    # Add a new chat expander
    add_chat_expander(transcript)
    st.session_state.messages = [{"role": "assistant", "content": "How may I assist you today?", "Source":""}]
179
+
180
+ st.sidebar.button('New chat', on_click=clear_chat_history, use_container_width=True)
181
+ st.sidebar.text("")
182
+ st.sidebar.write('<p style="font-size: 16px;text-align: center; color: #727477; font-weight: bold">Chat history</p>', unsafe_allow_html=True)
183
+ # Display existing chat expanders
184
+ for expander_info in st.session_state["expanders"]:
185
+ with st.sidebar.expander("Conversation ended at:"+"\n\n"+expander_info["timestamp"]):
186
+ for message in expander_info["chat_history"]:
187
+ if message.startswith("User:"):
188
+ st.write(f'<span style="color: #EF6A6A;">{message}</span>', unsafe_allow_html=True)
189
+ elif message.startswith("Assistant:"):
190
+ st.write(f'<span style="color: #F7BD45;">{message}</span>', unsafe_allow_html=True)
191
+ else:
192
+ st.write(message)
193
+
194
+
195
def generate_llm_response(agent_executor, prompt_input):
    """Run the agent on a single user prompt.

    Calls agent_executor with {"input": prompt_input} and returns a two-item
    list: the value under 'output' followed by the value under
    'intermediate_steps' of the result.
    """
    outcome = agent_executor({"input": prompt_input})
    answer = outcome['output']
    steps = outcome['intermediate_steps']
    return [answer, steps]
198
+
199
+
200
+ # User-provided prompt
201
if prompt := st.chat_input(disabled=not st.session_state["vector_db"]):
    st.session_state.messages.append({"role": "user", "content": prompt, "Source": ""})
    with st.chat_message("user"):
        st.write(prompt)

# Generate a new response only if the newest message is not from the assistant.
if st.session_state.messages[-1]["role"] != "assistant":
    with st.chat_message("assistant"):
        with st.spinner("Searching..."):
            start = timeit.default_timer()
            response = generate_llm_response(
                agent_executor=st.session_state["agent_executor"],
                prompt_input=prompt,
            )
            placeholder = st.empty()
            # The answer arrives complete, so render it once instead of
            # re-rendering the markdown after every single character.
            full_response = "".join(response[0])
            placeholder.markdown(full_response)

            # response[1] holds the agent's intermediate steps. When it is
            # non-empty the agent called a tool, so the matching source
            # documents are shown and stored with the answer. When it is
            # empty, only the answer itself is stored.
            if response[1]:
                st.text("-------------------------------------")
                # NOTE(review): this repeats retrieval with the raw prompt;
                # the agent may have queried the retriever with different
                # wording, so the listed sources could differ - confirm.
                docs = st.session_state["ensemble_retriver"].get_relevant_documents(prompt)
                merged_source_doc = []
                with st.expander("source"):
                    for idx, doc in enumerate(docs):
                        item = doc.dict()
                        source_doc = {
                            "Page": f"Source {idx + 1}",
                            "Source": f"**Source:** {item['metadata']['source'].split('/')[-1]}",
                            "page_content": item["page_content"],
                        }
                        merged_source_doc.append(source_doc)
                        # Render exactly what is stored, so the history
                        # view shows the same text on later re-runs.
                        st.markdown(source_doc["Page"])
                        st.markdown(source_doc["Source"])
                        st.markdown(source_doc["page_content"])
                        st.write("---")  # Add a separator between entries
                message = {"role": "assistant", "content": full_response, "Source": merged_source_doc}
            else:
                message = {"role": "assistant", "content": full_response, "Source": ""}
            st.session_state.messages.append(message)
            end = timeit.default_timer()
            print(f"Time to retrieve response: {end - start}")
opm_logo.png ADDED
requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ chromadb==0.4.6
2
+ langchain==0.0.278
3
+ openai==0.27.8
4
+ numpy==1.25.2
5
+ pandas==2.0.3
6
+ Pillow==9.5.0
7
+ pypdf==3.15.1
8
+ PyPDF2==3.0.1
9
+ python-dotenv==1.0.0
10
+ sentence-transformers==2.2.2
11
+ streamlit==1.25.0
12
+ streamlit-chat==0.1.1
13
+ rank-bm25==0.2.2
14
+ google-search-results==2.4.2
15
+ tiktoken
utils.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from pypdf import PdfReader
3
+ import os
4
+ from pathlib import Path
5
+ from dotenv import load_dotenv
6
+ import pickle
7
+ import timeit
8
+ from PIL import Image
9
+ import zipfile
10
+ import datetime
11
+ import shutil
12
+ from collections import defaultdict
13
+ import pandas as pd
14
+
15
+ from langchain.embeddings import HuggingFaceEmbeddings
16
+ from langchain.document_loaders import PyPDFLoader
17
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
18
+ from langchain.document_loaders import PyPDFLoader, DirectoryLoader
19
+ from langchain.memory import ConversationBufferMemory
20
+ from langchain.chains import ConversationalRetrievalChain
21
+ from langchain.prompts.prompt import PromptTemplate
22
+ from langchain.vectorstores import Chroma
23
+ from langchain.document_loaders import PyPDFDirectoryLoader
24
+ from langchain.retrievers import BM25Retriever, EnsembleRetriever
25
+ from langchain.document_loaders import UnstructuredHTMLLoader
26
+ from langchain.llms import OpenAI
27
+ from langchain.chat_models import ChatOpenAI
28
+ from langchain.agents.agent_toolkits import create_retriever_tool
29
+ from langchain.agents.agent_toolkits import create_conversational_retrieval_agent
30
+ from langchain.utilities import SerpAPIWrapper
31
+ from langchain.agents import Tool
32
+ from langchain.agents import load_tools
33
+
34
+ load_dotenv()
35
+
36
+
37
+ current_timestamp = datetime.datetime.now()
38
+ timestamp_string = current_timestamp.strftime("%Y-%m-%d %H:%M:%S")
39
+
40
+
41
def build_llm():
    '''
    Create the chat model used by the app.

    Returns a ChatOpenAI instance configured with temperature 0 and a
    256-token cap on each completion.
    '''
    return ChatOpenAI(temperature=0, max_tokens=256)
48
+
49
def build_embedding_model():
    '''
    Create the sentence-transformer embedding model.

    Returns HuggingFaceEmbeddings for
    'sentence-transformers/all-MiniLM-L6-v2', configured to run on the CPU.
    '''
    model_name = 'sentence-transformers/all-MiniLM-L6-v2'
    device_settings = {'device': 'cpu'}
    return HuggingFaceEmbeddings(model_name=model_name, model_kwargs=device_settings)
56
+
57
def unzip_opm(zip_file_path=r'OPM_Files/OPM_Retirement_backup-20230902T130906Z-001.zip'):
    '''
    Extract a ZIP archive into a sibling folder named after the archive.

    Args:
        zip_file_path: Path of the archive to extract. Defaults to the OPM
            retirement backup archive used by this project.

    Returns:
        Path of the folder the archive was extracted into: the archive's
        directory joined with the archive's file name minus its extension.

    Raises:
        FileNotFoundError: if the archive does not exist.
        zipfile.BadZipFile: if the file is not a valid ZIP archive.
    '''
    # Extract next to the archive, into "<archive name without .zip>/".
    extract_path = os.path.dirname(zip_file_path)
    extract_folder = os.path.splitext(os.path.basename(zip_file_path))[0]
    extract_folder_path = os.path.join(extract_path, extract_folder)

    # exist_ok avoids the check-then-create race and makes re-runs harmless.
    os.makedirs(extract_folder_path, exist_ok=True)

    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_folder_path)

    print(f'Unzipped {zip_file_path} to {extract_folder_path}')
    return extract_folder_path
85
+
86
def count_files_by_type(folder_path):
    '''
    Tally the files under a folder (recursively) by file extension.

    Returns a defaultdict(int) mapping each extension, as produced by
    os.path.splitext (leading dot included, '' when there is none), to the
    number of files that have it.
    '''
    tally = defaultdict(int)
    for _dirpath, _dirnames, filenames in os.walk(folder_path):
        for filename in filenames:
            tally[os.path.splitext(filename)[1]] += 1
    return tally
98
+
99
def generate_file_count_table(file_count_by_type):
    '''
    Turn an extension -> count mapping into a table.

    Returns a pandas DataFrame with the columns "File Type" and
    "Number of Files", sorted by "Number of Files" in descending order.
    '''
    extensions = list(file_count_by_type.keys())
    counts = list(file_count_by_type.values())
    table = pd.DataFrame({"File Type": extensions, "Number of Files": counts})
    # Most common file types first.
    return table.sort_values(by="Number of Files", ascending=False)
111
+
112
def move_files_to_folders(folder_path):
    '''
    Copy PDF and HTML files into per-type sub-folders of `folder_path`.

    Every '.pdf' file found under `folder_path` (recursively) is copied to
    `<folder_path>/PDFs` and every '.html' file to `<folder_path>/HTMLs`.
    Files with any other extension are ignored. Files are copied, not moved,
    so the originals stay in place. Files sharing a name overwrite each
    other in the destination.

    The destinations live inside `folder_path` because the loaders in this
    module read from `<folder_path>/PDFs` and `<folder_path>/HTMLs`.
    '''
    destinations = {
        '.pdf': os.path.join(folder_path, "PDFs"),
        '.html': os.path.join(folder_path, "HTMLs"),
    }
    destination_dirs = {os.path.abspath(path) for path in destinations.values()}

    for root, dirs, files in os.walk(folder_path):
        # Prune the destination folders from the walk so a second run does
        # not try to copy already-sorted files onto themselves.
        dirs[:] = [
            name for name in dirs
            if os.path.abspath(os.path.join(root, name)) not in destination_dirs
        ]
        for file in files:
            _, extension = os.path.splitext(file)
            dest_folder = destinations.get(extension)
            if dest_folder is None:
                continue

            os.makedirs(dest_folder, exist_ok=True)
            shutil.copy(os.path.join(root, file), os.path.join(dest_folder, file))
131
+
132
+
133
+
134
def load_vectorstore(persist_directory, embeddings):
    '''
    Load the Chroma vector store from disk, building it first if it is missing.

    If `persist_directory` exists, the store is opened from there. If it does
    NOT exist, the function will:
    1) Unzip the document archive and sort PDF/HTML files into folders
    2) Load the PDFs and the HTML files
    3) Split both into text chunks
    4) Embed all chunks and index them in a new Chroma DB
    5) Persist the Chroma DB to `persist_directory`

    Args:
        persist_directory: Folder holding (or to hold) the Chroma database.
        embeddings: Embedding model passed to Chroma.

    Returns:
        The loaded or newly created Chroma vector store.
    '''
    if os.path.exists(persist_directory):
        print("Using existing vectore store for these documents.")
        vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
        print("Chroma DB loaded from the disk")
        return vectorstore

    folder_path = unzip_opm()
    print("Vector store is not available. Creating new one.")
    file_count_by_type = count_files_by_type(folder_path)
    file_count_table = generate_file_count_table(file_count_by_type)
    print("File Count Table:")
    print(file_count_table)
    # Copy files into their per-type folders
    move_files_to_folders(folder_path)
    print("PDF and HTML files copied to separate folders.")

    # Load the PDF files that will go into the new Chroma DB
    pdf_folder_path = f"{folder_path}/PDFs"  # pdf folder
    html_folder_path = f"{folder_path}/HTMLs"  # html folder
    pdf_pages = PyPDFDirectoryLoader(pdf_folder_path).load()
    print("PDF files are loaded from the folder.")

    # Load the HTML files that will go into the new Chroma DB
    html_docs_path_list = [
        os.path.join(html_folder_path, name)
        for name in os.listdir(html_folder_path)
        if os.path.isfile(os.path.join(html_folder_path, name))
    ]

    html_pages = []
    docs_cannot_load = []
    for html_file in html_docs_path_list:
        try:
            html_pages.extend(UnstructuredHTMLLoader(html_file).load())
        except Exception as exc:
            # Best effort: one unreadable file must not abort the whole
            # build, but report which file failed and why.
            print("Cannot load the file:", html_file, repr(exc))
            docs_cannot_load.append(html_file)
    print("HTML files are loaded from the folder.")
    if docs_cannot_load:
        print(f"{len(docs_cannot_load)} HTML file(s) could not be loaded and were skipped.")

    # Split documents into chunks of up to 1000 characters, overlapping by 200
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        is_separator_regex=False,
    )

    pdf_texts = text_splitter.transform_documents(pdf_pages)
    html_texts = text_splitter.transform_documents(html_pages)
    # Merging all the text chunks (HTML + PDF)
    all_texts = pdf_texts + html_texts
    print("PDF and HTML docs are split into chunks and created a final list representing all the chunks.")

    # Create embeddings for all the text chunks and store them in a Chroma DB
    vectorstore = Chroma.from_documents(all_texts,
                                        embeddings,
                                        persist_directory=persist_directory)
    vectorstore.persist()
    print("Chroma DB created and loaded")
    return vectorstore
208
+
209
+
210
def load_text_chunks(text_chunks_pkl_dir):
    '''
    Load the cached list of documents from a pickle file, creating it if needed.

    The documents are required to build the BM25 retriever. Loading every
    source document in each session is slow, so the list is cached in a
    pickle file and read back from disk on later runs.

    If the cache cannot be read, the documents are loaded from the
    `PDFs` and `HTMLs` sub-folders of the unzipped archive, written to
    `text_chunks_pkl_dir`, and returned. This path reads those sub-folders
    without creating them, so they must already be populated.

    Args:
        text_chunks_pkl_dir: Path of the pickle cache file.

    Returns:
        The list of documents, either from the cache or freshly loaded.
    '''
    try:
        print("Text chunks are loading from the disk")
        with open(text_chunks_pkl_dir, 'rb') as file:
            # NOTE: pickle can execute arbitrary code; only load cache files
            # produced locally by this application.
            cached_text_chunks = pickle.load(file)
    except Exception as exc:
        # Missing or unreadable cache: fall through and rebuild it.
        print(f"Could not load cached text chunks: {exc!r}")
    else:
        print("Text chunks are loaded from the disk")
        return cached_text_chunks

    print("Creating text chunks from the docs and caching it.")
    folder_path = unzip_opm()
    pdf_folder_path = f"{folder_path}/PDFs"  # pdf folder
    html_folder_path = f"{folder_path}/HTMLs"  # html folder
    pdf_pages = PyPDFDirectoryLoader(pdf_folder_path).load()
    html_docs_path_list = [
        os.path.join(html_folder_path, name)
        for name in os.listdir(html_folder_path)
        if os.path.isfile(os.path.join(html_folder_path, name))
    ]

    html_pages = []
    for html_file in html_docs_path_list:
        try:
            html_pages.extend(UnstructuredHTMLLoader(html_file).load())
        except Exception as exc:
            # Best effort: skip unreadable files but say which and why.
            print("Cannot load the file:", html_file, repr(exc))
    all_texts = pdf_pages + html_pages

    # Write the cache to the path the caller asked for, so the next call
    # finds it, creating the parent folder when necessary.
    cache_folder = os.path.dirname(text_chunks_pkl_dir)
    if cache_folder:
        os.makedirs(cache_folder, exist_ok=True)
    with open(text_chunks_pkl_dir, 'wb') as file:
        pickle.dump(all_texts, file)
    print("Text chunks are created and cached")
    # Return the documents so the first run does not hand None to the caller.
    return all_texts
250
+
251
def load_ensemble_retriver(text_chunks, embeddings, chroma_vectorstore):
    """Build an ensemble retriever from a BM25 retriever and a Chroma retriever.

    Each underlying retriever is set to return one document (k = 1). The
    ensemble is created with weights 0.3 for BM25 and 0.7 for Chroma. The
    `embeddings` argument is accepted but not used here.
    """
    keyword_retriever = BM25Retriever.from_documents(text_chunks)
    keyword_retriever.k = 1
    vector_retriever = chroma_vectorstore.as_retriever(search_kwargs={"k": 1})
    return EnsembleRetriever(
        retrievers=[keyword_retriever, vector_retriever],
        weights=[0.3, 0.7],
    )
258
+
259
+
260
def load_conversational_retrievel_agent(retriever, llm, enable_web_search=False):
    '''Load a conversational retrieval agent.

    The agent always gets the OPM knowledge-base tool built from `retriever`.
    An internet search tool backed by SerpAPI is added only when
    `enable_web_search` is True.

    Previously the SerpAPI wrapper and tool were always constructed but never
    handed to the agent. The default keeps that effective tool list
    (knowledge base only) and no longer constructs the unused wrapper.

    Args:
        retriever: Retriever wrapped as the knowledge-base tool.
        llm: Chat model that drives the agent.
        enable_web_search: When True, also give the agent the
            "Current_Search" tool.

    Returns:
        The agent executor created by create_conversational_retrieval_agent.
    '''
    retriever_tool = create_retriever_tool(
        retriever,
        "Search_US_Office_of_Personnel_Management_Document",
        "Searches and returns documents regarding the U.S. Office of Personnel Management (OPM).")
    tools = [retriever_tool]

    if enable_web_search:
        search_api = SerpAPIWrapper()
        tools.append(Tool(
            name="Current_Search",
            func=search_api.run,
            description="useful for when you need to answer questions about current events or the current state of the world"
        ))

    agent_executor = create_conversational_retrieval_agent(llm, tools, verbose=True, max_token_limit=512)
    return agent_executor
279
+