File size: 3,843 Bytes
407931b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import os

import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
# from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from htmlTemplates import css, bot_template, user_template
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.llms import HuggingFaceHub
import os
def get_pdf_text(pdf_doc):
    """Concatenate the extracted text of every page of every given PDF.

    Args:
        pdf_doc: Iterable of PDF sources, each passed to ``PdfReader``
            (in this app, the files returned by the Streamlit uploader).
            ``None`` is treated like an empty iterable.

    Returns:
        One string holding the text of all pages, in document and page
        order. Empty when there are no documents or no page yields text.
    """
    # ``or ""`` is defensive: a page whose extraction yields a falsy value
    # (e.g. None for an image-only page) contributes nothing instead of
    # raising TypeError. ``join`` also avoids repeated string re-allocation.
    return "".join(
        page.extract_text() or ""
        for pdf in (pdf_doc or ())
        for page in PdfReader(pdf).pages
    )


def get_text_chunk(row_text):
    """Split raw document text into overlapping chunks.

    Args:
        row_text: The full text to split.

    Returns:
        The result of ``CharacterTextSplitter.split_text`` for ``row_text``,
        using newline as separator, a chunk size of 1000 and an overlap of
        200, both measured with ``len``.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(row_text)


def get_vectorstore(text_chunk):
    """Build a FAISS vector store from text chunks using OpenAI embeddings.

    Args:
        text_chunk: The texts to embed and index.

    Returns:
        The store produced by ``FAISS.from_texts``.
    """
    # The API key is read from the environment at call time.
    # Alternative embedding noted by the original author:
    #   HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    api_key = os.getenv("OPENAI_API_KEY")
    embedder = OpenAIEmbeddings(openai_api_key=api_key)
    return FAISS.from_texts(text_chunk, embedder)


def get_conversation_chain(vectorstores):
    """Create a conversational retrieval chain backed by the given store.

    Args:
        vectorstores: Object exposing ``as_retriever()``; its retriever is
            handed to the chain.

    Returns:
        The chain built by ``ConversationalRetrievalChain.from_llm`` with a
        ``ChatOpenAI`` model and a buffer memory stored under the
        ``"chat_history"`` key.
    """
    # Alternative LLM noted by the original author:
    #   HuggingFaceHub(repo_id="google/flan-t5-base",
    #                  model_kwargs={"temperature":0.5, "max_length":512})
    chat_model = ChatOpenAI(openai_api_key=os.getenv("OPENAI_API_KEY"))
    chat_memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
    )
    return ConversationalRetrievalChain.from_llm(
        llm=chat_model,
        retriever=vectorstores.as_retriever(),
        memory=chat_memory,
    )


def user_input(user_question):
    """Send a question to the conversation chain and render the chat history.

    The chain stored in ``st.session_state.conversation`` is called with the
    question; the ``"chat_history"`` entry of its response is saved to
    ``st.session_state.chat_history`` and written to the page. Messages at
    even positions use ``user_template``, odd positions use ``bot_template``.

    Args:
        user_question: The question text entered by the user.

    Returns:
        None. If no conversation chain exists yet, a warning is shown and
        nothing else happens.
    """
    # Function-scope import so this block does not depend on the module's
    # import section being edited.
    import html

    chain = st.session_state.get("conversation")
    if chain is None:
        # The chain is only created after documents are processed; calling
        # None would raise "TypeError: 'NoneType' object is not callable".
        st.warning("Please upload and process your documents before asking a question.")
        return

    response = chain({"question": user_question})
    st.session_state.chat_history = response["chat_history"]

    for indx, msg in enumerate(st.session_state.chat_history):
        template = user_template if indx % 2 == 0 else bot_template
        # Message text is untrusted (user input / model output) and is
        # rendered with unsafe_allow_html=True, so escape it before
        # substituting it into the HTML template.
        rendered = template.replace("{{MSG}}", html.escape(msg.content))
        st.write(rendered, unsafe_allow_html=True)



def main():
    """Render the Streamlit page: question box plus document sidebar.

    Loads environment variables from ``.env``, configures the page, makes
    sure ``st.session_state.conversation`` exists, answers a typed question
    through ``user_input``, and — when "Process" is pressed in the sidebar —
    turns the uploaded documents into a conversation chain stored in the
    session state.
    """
    # Load secrets (e.g. the API key read later via os.getenv).
    load_dotenv()

    # Page configuration and shared CSS.
    st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
    st.write(css, unsafe_allow_html=True)
    if "conversation" not in st.session_state:
        st.session_state.conversation = None

    st.header("Chat with multiple PDFs :books:")
    user_question = st.text_input("Ask a question about your docs")
    if user_question:
        if st.session_state.conversation is None:
            # No chain exists until documents have been processed; asking
            # now would otherwise try to call None.
            st.warning("Please upload and process your documents before asking a question.")
        else:
            user_input(user_question)

    # Sidebar: upload documents and build the conversation chain.
    with st.sidebar:
        st.subheader("Your Documents")
        pdf_doc = st.file_uploader(label="Upload your documents", accept_multiple_files=True)
        if st.button("Process"):
            if not pdf_doc:
                # Nothing uploaded: there is no text to index.
                st.warning("Please upload at least one document before processing.")
                return

            with st.spinner(text="Processing"):
                # Extract the raw text of all uploaded documents.
                row_text = get_pdf_text(pdf_doc)

                # Split the text into chunks.
                text_chunk = get_text_chunk(row_text)
                if not text_chunk:
                    # E.g. image-only PDFs; an empty chunk list cannot be
                    # turned into a useful vector store.
                    st.error("No extractable text was found in the uploaded documents.")
                    return

                # Build the vector store, then the conversation chain.
                vectorstores = get_vectorstore(text_chunk)
                st.session_state.conversation = get_conversation_chain(vectorstores)


# Run the app only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()