# Importing dependencies
from dotenv import load_dotenv
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from transformers import pipeline
from htmlTemplates import css, bot_template, user_template

# Load environment variables
load_dotenv()

# Custom template to guide the LLM: rewrites a follow-up question as a standalone
# question (the question-condensing step of ConversationalRetrievalChain)
custom_template = """Given the following conversation and a follow-up question, rephrase the follow-up question to be a standalone question, in its original language.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""

CUSTOM_QUESTION_PROMPT = PromptTemplate.from_template(custom_template)

# Extracting text from PDFs
def get_pdf_text(docs):
    text = ""
    for pdf in docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() can return None for image-only pages
            text += page.extract_text() or ""
    return text

# Splitting text into overlapping chunks
def get_chunks(raw_text):
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
    chunks = text_splitter.split_text(raw_text)
    return chunks

# Using a Hugging Face embeddings model and FAISS to create the vectorstore
def get_vectorstore(chunks):
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'}
    )
    vectorstore = FAISS.from_texts(texts=chunks, embedding=embeddings)
    return vectorstore

# Generating the conversation chain
def get_conversationchain(vectorstore):
    # Use a pretrained Hugging Face extractive question-answering model
    model_name = "distilbert-base-uncased-distilled-squad"
    qa_pipeline = pipeline("question-answering", model=model_name, tokenizer=model_name)

    def qa_function(question, context):
        response = qa_pipeline(question=question, context=context)
        return response['answer']

    memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
        output_key='answer'
    )

    def conversation_chain(inputs):
        question = inputs['question']
        # Retrieve the five chunks most similar to the question
        documents = vectorstore.similarity_search(question, k=5)
        if not documents:
            answer = ("Sorry, I couldn't find relevant information in the document. "
                      "Please ask a question related to the document.")
            memory.save_context({"user_input": question}, {"answer": answer})
            return {"chat_history": memory.chat_memory.messages, "answer": answer}
        # Join the page_content of each retrieved Document into one context string
        context = "\n".join([doc.page_content for doc in documents])
        answer = qa_function(question, context)
        memory.save_context({"user_input": question}, {"answer": answer})
        return {"chat_history": memory.chat_memory.messages, "answer": answer}

    return conversation_chain
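
# NOTE: ConversationalRetrievalChain and CUSTOM_QUESTION_PROMPT are imported/defined
# above but never used by the manual chain in get_conversationchain. The function
# below is a minimal, untested sketch of the LangChain-native wiring they suggest;
# the generation model ("google/flan-t5-base") and the helper name are illustrative
# choices, not part of the original app.
def get_conversationchain_langchain(vectorstore):
    from langchain.llms import HuggingFacePipeline

    # Wrap a local text2text-generation pipeline as a LangChain LLM
    gen_pipeline = pipeline("text2text-generation", model="google/flan-t5-base",
                            max_new_tokens=256)
    llm = HuggingFacePipeline(pipeline=gen_pipeline)

    memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
        output_key='answer'
    )
    # condense_question_prompt applies CUSTOM_QUESTION_PROMPT so follow-up questions
    # are rewritten into standalone questions before retrieval
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
        condense_question_prompt=CUSTOM_QUESTION_PROMPT,
        memory=memory
    )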
# Generating a response from the user query and rendering the chat history
def handle_question(question):
    if st.session_state.conversation is None:
        st.warning("Please upload and process a document first.")
        return
    response = st.session_state.conversation({'question': question})
    st.session_state.chat_history = response["chat_history"]
    # Messages alternate user/bot, so even indices are the user's turns
    for i, msg in enumerate(st.session_state.chat_history):
        if i % 2 == 0:
            st.write(user_template.replace("{{MSG}}", msg.content), unsafe_allow_html=True)
        else:
            st.write(bot_template.replace("{{MSG}}", msg.content), unsafe_allow_html=True)

def main():
    st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("CSS Edge - Intelligent Document Chatbot :books:")
    question = st.text_input("Ask a question from your document:")
    if question:
        handle_question(question)

    with st.sidebar:
        st.subheader("Your documents")
        docs = st.file_uploader("Upload your PDF here and click on 'Process'",
                                accept_multiple_files=True)
        if st.button("Process"):
            if not docs:
                st.warning("Please upload at least one PDF before processing.")
            else:
                with st.spinner("Processing..."):
                    # Get the PDF text
                    raw_text = get_pdf_text(docs)

                    # Get the text chunks
                    text_chunks = get_chunks(raw_text)

                    # Create vectorstore
                    vectorstore = get_vectorstore(text_chunks)

                    # Create conversation chain
                    st.session_state.conversation = get_conversationchain(vectorstore)

if __name__ == '__main__':
    main()
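
# Usage sketch (assumptions: the file is saved as app.py, and the dependency list
# below is a typical set for these imports, not pinned or verified versions):
#   pip install streamlit python-dotenv PyPDF2 langchain transformers \
#       sentence-transformers faiss-cpu torch
#   streamlit run app.py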