# Importing dependencies
from dotenv import load_dotenv
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from transformers import pipeline
from htmlTemplates import css, bot_template, user_template
# Load environment variables
load_dotenv()
# Custom prompt template that rewrites a follow-up question as a standalone question
custom_template = """Given the following conversation and a follow-up question, rephrase the follow-up question to be a standalone question, in its original language.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""
CUSTOM_QUESTION_PROMPT = PromptTemplate.from_template(custom_template)
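
# Illustrative example (not part of the app flow): the template has two input
# variables, chat_history and question, which PromptTemplate fills via .format():
#   CUSTOM_QUESTION_PROMPT.format(
#       chat_history="Human: What is FAISS?\nAI: A similarity-search library.",
#       question="Who maintains it?")
# returns the full instruction string with both placeholders inlined.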
# Extracting text from the uploaded PDFs
def get_pdf_text(docs):
    text = ""
    for pdf in docs:
        # Streamlit's UploadedFile objects are file-like, so PdfReader accepts them directly
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() can return None on image-only pages
            text += page.extract_text() or ""
    return text
# Converting text to chunks
def get_chunks(raw_text):
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
    chunks = text_splitter.split_text(raw_text)
    return chunks
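
# Illustrative sketch: with chunk_size=1000 and chunk_overlap=200, consecutive
# chunks repeat roughly the last 200 characters of the previous chunk, so a
# sentence that spans a chunk boundary stays retrievable in one piece, e.g.
#   chunks = get_chunks("some line\n" * 500)
#   all(len(c) <= 1000 for c in chunks)  # True for newline-separated input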
# Using a Hugging Face embeddings model and FAISS to create the vectorstore
def get_vectorstore(chunks):
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'}
    )
    vectorstore = FAISS.from_texts(texts=chunks, embedding=embeddings)
    return vectorstore
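
# Illustrative sketch: the store can also be queried directly, e.g.
#   store = get_vectorstore(["Paris is the capital of France.",
#                            "FAISS performs approximate nearest-neighbour search."])
#   store.similarity_search("capital of France", k=1)
# should return the Paris sentence as the closest chunk.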
# Generating the conversation chain
def get_conversationchain(vectorstore):
    # Use a pretrained extractive question-answering model from Hugging Face
    model_name = "distilbert-base-uncased-distilled-squad"
    qa_pipeline = pipeline("question-answering", model=model_name, tokenizer=model_name)

    def qa_function(question, context):
        response = qa_pipeline(question=question, context=context)
        return response['answer']

    memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
        output_key='answer'
    )

    def conversation_chain(inputs):
        question = inputs['question']
        # Retrieve the most relevant chunks and join their text content
        documents = vectorstore.similarity_search(question, k=5)
        context = "\n".join([doc.page_content for doc in documents])
        answer = qa_function(question, context)
        # Store the turn so chat_history alternates user/bot messages
        memory.save_context({"user_input": question}, {"answer": answer})
        return {"chat_history": memory.chat_memory.messages, "answer": answer}

    return conversation_chain
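
# Note: CUSTOM_QUESTION_PROMPT above is intended for LangChain's built-in
# ConversationalRetrievalChain rather than this hand-rolled chain. A hedged
# sketch of that variant inside get_conversationchain(), assuming an LLM
# wrapper (e.g. a HuggingFaceHub instance) is available as `llm`:
#   chain = ConversationalRetrievalChain.from_llm(
#       llm=llm,  # hypothetical LLM object, not defined in this file
#       retriever=vectorstore.as_retriever(),
#       condense_question_prompt=CUSTOM_QUESTION_PROMPT,
#       memory=memory,
#   )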
# Generating a response to the user's query and rendering the chat history
def handle_question(question):
    if st.session_state.conversation is None:
        st.warning("Please upload and process a document first.")
        return
    response = st.session_state.conversation({'question': question})
    st.session_state.chat_history = response["chat_history"]
    # Messages alternate: even indices are user turns, odd indices are bot turns
    for i, msg in enumerate(st.session_state.chat_history):
        if i % 2 == 0:
            st.write(user_template.replace("{{MSG}}", msg.content), unsafe_allow_html=True)
        else:
            st.write(bot_template.replace("{{MSG}}", msg.content), unsafe_allow_html=True)
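
# user_template and bot_template (from htmlTemplates.py, not shown here) are
# HTML snippets with a "{{MSG}}" placeholder; an assumed minimal shape:
#   bot_template = '<div class="chat-message bot">{{MSG}}</div>'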
def main():
    st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
    st.write(css, unsafe_allow_html=True)
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None
    st.header("CSS Edge - Intelligent Document Chatbot :books:")
    question = st.text_input("Ask a question from your document:")
    if question:
        handle_question(question)
    with st.sidebar:
        st.subheader("Your documents")
        docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
        if st.button("Process"):
            if not docs:
                st.warning("Please upload at least one PDF first.")
            else:
                with st.spinner("Processing..."):
                    # Extract raw text from the uploaded PDFs
                    raw_text = get_pdf_text(docs)
                    # Split the text into overlapping chunks
                    text_chunks = get_chunks(raw_text)
                    # Embed the chunks and build the FAISS vectorstore
                    vectorstore = get_vectorstore(text_chunks)
                    # Build the conversation chain over the vectorstore
                    st.session_state.conversation = get_conversationchain(vectorstore)

if __name__ == '__main__':
    main()
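
# Launch locally with: streamlit run app.py
# (On Hugging Face Spaces, a Streamlit Space typically picks up app.py automatically.)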