# Importing dependencies
from dotenv import load_dotenv
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from transformers import pipeline
from htmlTemplates import css, bot_template, user_template

# Load environment variables
load_dotenv()

# Custom template to rephrase follow-up questions as standalone questions
# (note: defined here but not wired into the QA pipeline below)
custom_template = """Given the following conversation and a follow-up question, rephrase the follow-up question to be a standalone question, in its original language.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""

CUSTOM_QUESTION_PROMPT = PromptTemplate.from_template(custom_template)


# Extracting text from PDFs
def get_pdf_text(docs):
    text = ""
    for pdf in docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() can return None for pages with no extractable text
            text += page.extract_text() or ""
    return text


# Splitting text into chunks
def get_chunks(raw_text):
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
    chunks = text_splitter.split_text(raw_text)
    return chunks


# Using a Hugging Face embeddings model and FAISS to create the vectorstore
def get_vectorstore(chunks):
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'}
    )
    vectorstore = FAISS.from_texts(texts=chunks, embedding=embeddings)
    return vectorstore


# Generating the conversation chain with improved out-of-scope handling
def get_conversationchain(vectorstore):
    # Use a Hugging Face extractive question-answering model
    model_name = "distilbert-base-uncased-distilled-squad"  # Pretrained QA model
    qa_pipeline = pipeline("question-answering", model=model_name, tokenizer=model_name)

    def qa_function(question, context):
        response = qa_pipeline(question=question, context=context)
        return response['answer'], response['score']

    memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
        output_key='answer'
    )

    fallback_answer = ("Sorry, I couldn't find relevant information in the document. "
                       "Please ask a question related to the document.")

    def conversation_chain(inputs):
        question = inputs['question']

        # Retrieve the most similar Document objects from the vectorstore
        documents = vectorstore.similarity_search(question, k=5)

        # If no similar documents are found, fall back immediately
        if not documents:
            memory.save_context({"user_input": question}, {"answer": fallback_answer})
            return {"chat_history": memory.chat_memory.messages, "answer": fallback_answer}

        # Extract `page_content` from each Document to build the QA context
        context = "\n".join([doc.page_content for doc in documents])
        answer, score = qa_function(question, context)

        # Treat low-confidence answers (score below 0.5) as out of scope
        if score < 0.5:
            answer = fallback_answer

        memory.save_context({"user_input": question}, {"answer": answer})
        return {"chat_history": memory.chat_memory.messages, "answer": answer}

    return conversation_chain


# Generating responses to user queries and displaying them accordingly
def handle_question(question):
    # Guard against questions asked before any document has been processed
    if st.session_state.conversation is None:
        st.warning("Please upload and process a document first.")
        return
    response = st.session_state.conversation({'question': question})
    st.session_state.chat_history = response["chat_history"]
    for i, msg in enumerate(st.session_state.chat_history):
        # Messages alternate: even indices are the user, odd indices are the bot
        if i % 2 == 0:
            st.write(user_template.replace("{{MSG}}", msg.content), unsafe_allow_html=True)
        else:
            st.write(bot_template.replace("{{MSG}}", msg.content), unsafe_allow_html=True)


def main():
    st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("CSS Edge - Intelligent Document Chatbot :books:")
    question = st.text_input("Ask a question from your document:")
    if question:
        handle_question(question)

    with st.sidebar:
        st.subheader("Your documents")
        docs = st.file_uploader("Upload your PDF here and click on 'Process'",
                                accept_multiple_files=True)
        if st.button("Process"):
            if not docs:
                st.warning("Please upload at least one PDF before processing.")
            else:
                with st.spinner("Processing..."):
                    # Get the PDF text
                    raw_text = get_pdf_text(docs)
                    # Get the text chunks
                    text_chunks = get_chunks(raw_text)
                    # Create the vectorstore
                    vectorstore = get_vectorstore(text_chunks)
                    # Create the conversation chain
                    st.session_state.conversation = get_conversationchain(vectorstore)


if __name__ == '__main__':
    main()
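
# Usage sketch (assumptions: the script is saved as app.py, and an
# htmlTemplates.py module providing css, bot_template, and user_template sits
# alongside it; the htmlTemplates name comes from the import above, while
# app.py itself is a guess). The package list below is inferred from the
# imports rather than taken from a pinned requirements file:
#
#   pip install streamlit PyPDF2 langchain sentence-transformers faiss-cpu \
#       transformers python-dotenv
#   streamlit run app.py
#
# Note: the langchain.* import paths above match older (pre-0.1) LangChain
# releases; newer versions expose these classes under langchain_community.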