File size: 5,476 Bytes
0317d24
ffc4ebe
0317d24
 
ffc4ebe
0317d24
 
 
 
 
 
 
 
 
ffc4ebe
0317d24
 
ffc4ebe
0317d24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c15a5f1
0317d24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c15a5f1
0317d24
 
c15a5f1
0317d24
 
 
 
 
 
 
 
c15a5f1
0317d24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c15a5f1
0317d24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import google.generativeai as genai
from langchain_community.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from htmlTemplates import css, bot_template, user_template
import logging
import faiss

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s')

# Load environment variables from a local .env file, then configure the
# Gemini SDK with the key. Requires GOOGLE_API_KEY to be set in the
# environment (or .env); genai.configure does not validate the key here.
load_dotenv()
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))


def get_pdf_text(pdf_docs):
    """Extract and concatenate the text of every page of every PDF.

    Args:
        pdf_docs: Iterable of file-like objects (e.g. Streamlit uploads)
            readable by PyPDF2's ``PdfReader``.

    Returns:
        str: All extracted page text concatenated in order. A file that
        fails to parse is logged and skipped rather than aborting the batch.
    """
    parts = []
    for pdf in pdf_docs:
        try:
            pdf_reader = PdfReader(pdf)
            for page in pdf_reader.pages:
                # extract_text() returns None for pages with no extractable
                # text (e.g. scanned images); guard to avoid a TypeError
                # when concatenating.
                parts.append(page.extract_text() or "")
        except Exception as e:
            logging.error(f"Error processing PDF file: {e}")
    # join() instead of repeated += avoids quadratic string building.
    return "".join(parts)

def get_text_chunks(text):
    """Split raw document text into overlapping chunks for embedding.

    Args:
        text: The full concatenated document text.

    Returns:
        list[str]: Chunks of up to 10,000 characters, with a
        1,000-character overlap between consecutive chunks so context
        isn't lost at chunk boundaries.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    return splitter.split_text(text)

def get_vector_store(text_chunks):
    """Embed the text chunks and persist them as a FAISS index on disk.

    The full vector store (index plus docstore and id mapping) is written
    to a "faiss_index" directory alongside this file, created if missing.

    Args:
        text_chunks: List of text strings to embed and index.
    """
    logging.info("Starting vector store creation")
    embedder = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    logging.info("Embeddings created")

    # Build the in-memory FAISS store from the chunks.
    store = FAISS.from_texts(texts=text_chunks, embedding=embedder)
    logging.info("FAISS vector store created")

    # Persist next to this file so user_input() can reload it later.
    index_dir = os.path.join(os.path.dirname(__file__), "faiss_index")
    os.makedirs(index_dir, exist_ok=True)
    store.save_local(index_dir)
    logging.info("FAISS vector store saved successfully.")

def get_conversation_chain():
    """Build the question-answering chain: Gemini model plus a strict prompt.

    Returns:
        A "stuff"-type QA chain that answers only from the supplied
        context and declines to guess when the answer isn't present.
    """
    template = """
        Answer the question clear and precise. If not provided the context return the result as
        "Sorry I dont know the answer", don't provide the wrong answer.
        Context:\n {context}?\n
        Question:\n{question}\n
        Answer:
    """
    qa_prompt = PromptTemplate(template=template, input_variables=["context", "question"])
    # Low temperature keeps answers close to the retrieved context.
    llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3)
    return load_qa_chain(llm, chain_type="stuff", prompt=qa_prompt)

def user_input(user_question):
    """Answer a question against the persisted FAISS index via Streamlit.

    Reloads the saved vector store, retrieves the chunks most similar to
    the question, runs the QA chain over them, and renders the result.
    Failures are logged and shown to the user instead of being raised.

    Args:
        user_question: The question string typed by the user.
    """
    logging.info("Processing user input")

    index_dir = os.path.join(os.path.dirname(__file__), "faiss_index")

    # Guard clause: nothing to search until PDFs have been processed.
    if not os.path.exists(index_dir):
        st.warning("Please upload and process PDF files before asking questions.")
        return

    try:
        # The index is written by this same app, so deserialization is
        # trusted; hence allow_dangerous_deserialization=True.
        embedder = GoogleGenerativeAIEmbeddings(model='models/embedding-001')
        store = FAISS.load_local(index_dir, embedder, allow_dangerous_deserialization=True)
        logging.info("FAISS vector store loaded successfully")

        matches = store.similarity_search(user_question)
        qa_chain = get_conversation_chain()
        result = qa_chain({"input_documents": matches, "question": user_question}, return_only_outputs=True)
        st.write(user_template.replace("{{MSG}}", result["output_text"]), unsafe_allow_html=True)
    except Exception as e:
        logging.error(f"Error processing user input: {e}")
        st.write(bot_template.replace("{{MSG}}", f"Sorry, there was an error processing your request: {str(e)}. Please try again later."), unsafe_allow_html=True)

def main():
    """Streamlit entry point: page setup, sidebar upload flow, and Q&A box."""
    st.set_page_config(page_title="Chat with multiple PDFs",
                       page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    # Initialize session state slots on the first run.
    for key in ("conversation", "chat_history"):
        if key not in st.session_state:
            st.session_state[key] = None

    st.header("Chat with multiple PDFs with Gemini Pro :books:")

    # Sidebar: upload PDFs and build the vector store on demand.
    with st.sidebar:
        pdf_docs = st.file_uploader(
            "Upload your PDF Files and Click on Process",
            accept_multiple_files=True
        )
        if st.button("Process"):
            with st.spinner("Processing..."):
                try:
                    chunks = get_text_chunks(get_pdf_text(pdf_docs))
                    get_vector_store(chunks)
                    st.session_state.conversation = get_conversation_chain()
                    st.success("PDFs processed successfully. You can now ask questions.")
                except Exception as e:
                    logging.error(f"Error processing PDF files: {e}")
                    st.error("There was an error processing the PDF files. Please try again later.")

    # Main pane: only answer once an index exists on disk.
    question = st.text_input("Ask a Question from the PDF Files")
    if question:
        index_file = os.path.join(os.path.dirname(__file__), "faiss_index", "index.faiss")
        if os.path.exists(index_file):
            user_input(question)
        else:
            st.warning("Please upload and process PDF files before asking questions.")

# Entry point when executed directly (e.g. `streamlit run <this file>`).
if __name__ == "__main__":
    main()