import os
import pickle
import zipfile
import datetime
import shutil
from collections import defaultdict

import pandas as pd
from dotenv import load_dotenv

from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFDirectoryLoader, UnstructuredHTMLLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.prompts.prompt import PromptTemplate
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

load_dotenv()


current_timestamp = datetime.datetime.now()
timestamp_string = current_timestamp.strftime("%Y-%m-%d %H:%M:%S")


def build_llm():
    '''
    Load the OpenAI chat model used to generate answers.
    '''
    llm = ChatOpenAI(temperature=0, max_tokens=256)
    return llm

def build_embedding_model():
    '''
    Load the sentence-transformers model used to embed text.
    '''
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2',
                                       model_kwargs={'device': 'cpu'})
    return embeddings

def unzip_opm():
    '''
    Extract the OPM document archive next to the ZIP file and
    return the path of the extraction folder.
    '''
    # Path to the ZIP archive of OPM retirement documents
    zip_file_path = r'OPM_Files/OPM_Retirement_backup-20230902T130906Z-001.zip'

    # Extract into the directory that contains the ZIP file
    extract_path = os.path.dirname(zip_file_path)

    # Create a folder with the same name as the ZIP file (without the .zip extension)
    extract_folder = os.path.splitext(os.path.basename(zip_file_path))[0]
    extract_folder_path = os.path.join(extract_path, extract_folder)
    os.makedirs(extract_folder_path, exist_ok=True)

    # Extract all the contents into the created folder
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_folder_path)

    print(f'Unzipped {zip_file_path} to {extract_folder_path}')
    return extract_folder_path

def count_files_by_type(folder_path):
    '''
    Count the files under `folder_path` (recursively), grouped by file extension.
    '''
    file_count_by_type = defaultdict(int)
    
    for root, _, files in os.walk(folder_path):
        for file in files:
            _, extension = os.path.splitext(file)
            file_count_by_type[extension] += 1
    
    return file_count_by_type

def generate_file_count_table(file_count_by_type):
    '''
    Generate a table of file counts per file type.
    '''
    data = {"File Type": [], "Number of Files": []}
    for extension, count in file_count_by_type.items():
        data["File Type"].append(extension)
        data["Number of Files"].append(count)
    
    df = pd.DataFrame(data)
    df = df.sort_values(by="Number of Files", ascending=False)  # Sort by number of files
    return df

def move_files_to_folders(folder_path):
    '''
    Copy files into per-type folders: PDF docs into a "PDFs" folder,
    HTML docs into an "HTMLs" folder. Other file types are skipped.
    '''
    for root, _, files in os.walk(folder_path):
        for file in files:
            _, extension = os.path.splitext(file)
            source_path = os.path.join(root, file)
            
            if extension == '.pdf':
                dest_folder = "PDFs"
            elif extension == '.html':
                dest_folder = "HTMLs"
            else:
                continue
            
            dest_path = os.path.join(dest_folder, file)
            os.makedirs(dest_folder, exist_ok=True)
            shutil.copy(source_path, dest_path)



def load_vectorstore(persist_directory, embeddings):
    '''
    Try to load the Chroma database from disk first. If it does not exist,
    do the following:
        1) Load the PDFs
        2) Create text chunks
        3) Index the chunks and store them in a Chroma DB
        4) Perform the same steps for the HTML files
        5) Persist the final Chroma DB to disk
    '''
    if os.path.exists(persist_directory):
        print("Using existing vector store for these documents.")
        vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
        print("Chroma DB loaded from the disk")
        return vectorstore
    else:
        folder_path= unzip_opm()
        print("Vector store is not available. Creating new one.")
        file_count_by_type = count_files_by_type(folder_path)
        file_count_table = generate_file_count_table(file_count_by_type)
        print("File Count Table:")
        print(file_count_table)
        #move files into respective folders
        move_files_to_folders(folder_path)
        print("PDF and HTML files copied to separate folders.")
        
        # Load the pdf files from the pdffolder in order to create new chroma db
        pdf_folder_path= f"{folder_path}/PDFs" #pdf folder
        html_folder_path= f"{folder_path}/HTMLs" #html folder
        pdf_dir_loader = PyPDFDirectoryLoader(pdf_folder_path)
        pdf_pages = pdf_dir_loader.load()
        print("PDF files are loaded from the folder.")

        
        #Loading HTML files from the html folder in order to create new chroma db 
        HTML_docs_path_list = [os.path.join(html_folder_path, f) for f in os.listdir(html_folder_path) if os.path.isfile(os.path.join(html_folder_path, f))]

        html_loaders= []
        for html_file in HTML_docs_path_list:
            loader = UnstructuredHTMLLoader(html_file)
            html_loaders.append(loader)

        html_pages = []
        docs_cannot_load = []
        for loader in html_loaders:
            try:
                html_pages.extend(loader.load())
            except Exception:
                print("Cannot load the file:", loader.file_path)
                docs_cannot_load.append(loader)
        print("HTML files are loaded from the folder.")
        # Create text chunks from the PDF docs
        text_splitter = RecursiveCharacterTextSplitter(
            # 1,000-character chunks with a 200-character overlap between chunks
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
            is_separator_regex=False,
        )

        pdf_texts = text_splitter.transform_documents(pdf_pages)
        # Create text chunks from the HTML docs
        html_texts = text_splitter.transform_documents(html_pages)
        # Merge all the text chunks (HTML + PDF)
        all_texts = pdf_texts + html_texts
        print("PDF and HTML docs are split into chunks and merged into a single list.")

        # Create embeddings for all the text chunks and store it in a Chroma DB
        vectorstore = Chroma.from_documents(all_texts,
                                            embeddings,
                                            persist_directory=persist_directory)
        vectorstore.persist()
        print("Chroma DB created and loaded")
        return vectorstore
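
# A minimal usage sketch for the vector store (hedged: the "chroma_db" persist
# directory and the query string below are illustrative assumptions, not values
# taken from this repo):
#
#     embeddings = build_embedding_model()
#     vectorstore = load_vectorstore("chroma_db", embeddings)
#     for doc in vectorstore.similarity_search("retirement annuity eligibility", k=2):
#         print(doc.metadata.get("source"), doc.page_content[:120])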


def load_text_chunks(text_chunks_pkl_dir):
    '''
    Load the pickle file that holds all the documents from disk;
    if it does not exist, create a new one.
    The raw documents are required to build the BM25 retriever, but loading
    every document on every session is slow, so all the docs are cached in a
    pickle file and that file is loaded from disk instead.
    '''
    try:
        print("Loading text chunks from the disk")
        with open(text_chunks_pkl_dir, 'rb') as file:
            cached_text_chunks = pickle.load(file)
        print("Text chunks are loaded from the disk")
        return cached_text_chunks
    except FileNotFoundError:
        print("Creating text chunks from the docs and caching them.")
        folder_path = unzip_opm()
        pdf_folder_path = f"{folder_path}/PDFs"    # PDF folder
        html_folder_path = f"{folder_path}/HTMLs"  # HTML folder
        pdf_dir_loader = PyPDFDirectoryLoader(pdf_folder_path)
        pdf_pages = pdf_dir_loader.load()
        HTML_docs_path_list = [os.path.join(html_folder_path, f) for f in os.listdir(html_folder_path) if os.path.isfile(os.path.join(html_folder_path, f))]

        html_loaders = []
        for html_file in HTML_docs_path_list:
            loader = UnstructuredHTMLLoader(html_file)
            html_loaders.append(loader)

        html_pages = []
        for loader in html_loaders:
            try:
                html_pages.extend(loader.load())
            except Exception:
                print("Cannot load the file:", loader.file_path)
        all_texts = pdf_pages + html_pages
        # Cache the list so later sessions can skip the loading step
        with open(text_chunks_pkl_dir, 'wb') as file:
            pickle.dump(all_texts, file)
        print("Text chunks are created and cached")
        return all_texts
        
def load_ensemble_retriver(text_chunks, embeddings, chroma_vectorstore):
    '''
    Build an ensemble retriever that combines BM25 (keyword) and Chroma
    (dense vector) retrievers, then wrap it in a MultiQueryRetriever that
    uses an LLM to generate variations of the user's query.
    '''
    bm25_retriever = BM25Retriever.from_documents(text_chunks)
    bm25_retriever.k = 1
    chroma_retriever = chroma_vectorstore.as_retriever(search_kwargs={"k": 1})
    # Weighted fusion of the two result lists: dense results count more than keyword hits
    ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, chroma_retriever], weights=[0.3, 0.7])
    retriever_from_llm = MultiQueryRetriever.from_llm(retriever=ensemble_retriever, llm=ChatOpenAI())
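
# Usage sketch (hedged: the question below is illustrative only). The
# MultiQueryRetriever wrapper calls the LLM to rewrite the query, so
# OPENAI_API_KEY must be set in the environment:
#
#     retriever = load_ensemble_retriver(text_chunks, embeddings, vectorstore)
#     docs = retriever.get_relevant_documents("How is a FERS annuity calculated?")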


def load_conversational_retrievel_chain(retriever, llm):
    '''
    Build a RetrievalQA chain ("stuff" chain type) over the given retriever.
    A ConversationBufferMemory keyed on the question is passed into the chain,
    so earlier turns of the conversation are available to the prompt as {history}.
    '''
    template = """You are a helpful assistant. You do not respond as 'User' or pretend to be 'User'. You only respond once as 'Assistant'.
    Use the following pieces of context to answer the question at the end. If you don't know the answer,\
    just say that you don't know, don't try to make up an answer.

    {context}

    {history}
    Question: {question}
    Helpful Answer:"""

    prompt = PromptTemplate(input_variables=["history", "context", "question"], template=template)
    memory = ConversationBufferMemory(input_key="question", memory_key="history")

    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt, "memory": memory},
    )
    return qa
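

if __name__ == "__main__":
    # Smoke-test sketch for the full pipeline. "chroma_db" is an assumed persist
    # directory name and the question is illustrative; the pickle path matches
    # the default used by load_text_chunks. Requires OPENAI_API_KEY (via .env).
    llm = build_llm()
    embeddings = build_embedding_model()
    text_chunks = load_text_chunks("text_chunks.pkl")
    vectorstore = load_vectorstore("chroma_db", embeddings)
    retriever = load_ensemble_retriver(text_chunks, embeddings, vectorstore)
    qa_chain = load_conversational_retrievel_chain(retriever, llm)
    response = qa_chain({"query": "What is the minimum retirement age under FERS?"})
    print(response["result"])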