# Streamlit chatbot for question answering over preloaded or uploaded text notes.
import html
import os

import streamlit as st
from dotenv import load_dotenv
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from transformers import pipeline

from htmlTemplates import css, bot_template, user_template
# Load environment variables
load_dotenv()

# Prompt template that asks an LLM to rewrite a follow-up question as a
# standalone question. It expects two placeholders: {chat_history} and
# {question}.
# NOTE(review): neither name below is referenced elsewhere in the visible
# source; kept because other modules may import them.
custom_template = """Given the following conversation and a follow-up question, rephrase the follow-up question to be a standalone question, in its original language.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""
CUSTOM_QUESTION_PROMPT = PromptTemplate.from_template(custom_template)
def get_text_files_content(folder):
    """Return the concatenated contents of every ``.txt`` file in *folder*.

    Files are read as UTF-8 in sorted filename order so the result is
    deterministic across platforms, and each file's content is followed by
    a newline. Only direct children of *folder* are considered; entries
    that are not regular files are skipped even if their name ends in
    ``.txt``.

    Args:
        folder: Path of the directory to scan.

    Returns:
        The combined text, or an empty string when the folder contains no
        ``.txt`` files.

    Raises:
        OSError: If *folder* cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    parts = []
    # os.listdir() returns entries in arbitrary order; sort for stable output.
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith('.txt'):
            continue
        path = os.path.join(folder, filename)
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as file:
            parts.append(file.read())
            parts.append("\n")
    return "".join(parts)
def get_chunks(raw_text, chunk_size=1000, chunk_overlap=200):
    """Split *raw_text* into overlapping chunks for embedding.

    The text is split on newlines with ``CharacterTextSplitter`` and chunk
    length is measured in characters (``len``).

    Args:
        raw_text: The text to split. ``None``, empty, or whitespace-only
            input yields an empty list without constructing the splitter.
        chunk_size: Target maximum chunk length in characters.
        chunk_overlap: Number of characters shared between adjacent chunks.

    Returns:
        A list of chunk strings (possibly empty).
    """
    # Nothing to split: return early so callers get a plain empty list.
    if not raw_text or not raw_text.strip():
        return []
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_text(raw_text)
def get_vectorstore(chunks, model_name="sentence-transformers/all-MiniLM-L6-v2", device="cpu"):
    """Embed *chunks* with a Hugging Face model and index them in FAISS.

    Args:
        chunks: Non-empty list of text chunks to index.
        model_name: Name of the sentence-embedding model to load.
        device: Device string passed to the embedding model via
            ``model_kwargs`` (defaults to ``"cpu"``).

    Returns:
        The FAISS vector store built from *chunks*.

    Raises:
        ValueError: If *chunks* is empty, since there is nothing to index.
    """
    # Fail early with a clear message instead of an obscure error from the
    # indexing call when there is nothing to embed.
    if not chunks:
        raise ValueError("Cannot build a vector store from an empty list of chunks.")
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={'device': device},
    )
    return FAISS.from_texts(texts=chunks, embedding=embeddings)
def get_conversationchain(vectorstore, k=5, min_score=0.5,
                          model_name="distilbert-base-uncased-distilled-squad"):
    """Build a callable that answers questions from *vectorstore*.

    The returned callable retrieves the ``k`` most similar documents, joins
    their ``page_content`` into one context string, and runs an extractive
    question-answering pipeline over it. When nothing is retrieved, or the
    pipeline's score is below ``min_score``, a fixed out-of-scope message is
    returned instead. Every exchange is recorded in a
    ``ConversationBufferMemory`` owned by the returned callable.

    Args:
        vectorstore: Object exposing ``similarity_search(question, k=...)``
            that returns documents with a ``page_content`` attribute.
        k: Number of documents to retrieve per question.
        min_score: Minimum pipeline score for an answer to be accepted.
        model_name: Hugging Face model used for both the QA model and its
            tokenizer.

    Returns:
        A function taking ``{"question": str}`` and returning a dict with
        ``"chat_history"`` (the memory's message list) and ``"answer"``.
    """
    fallback_answer = (
        "Sorry, I couldn't find relevant information in the document. "
        "Please ask a question related to the document."
    )

    # Pretrained extractive QA model.
    qa_pipeline = pipeline("question-answering", model=model_name, tokenizer=model_name)

    memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
        output_key='answer',
    )

    def conversation_chain(inputs):
        question = inputs['question']
        documents = vectorstore.similarity_search(question, k=k)

        if documents:
            # Extract `page_content` from each Document to form the context.
            context = "\n".join(doc.page_content for doc in documents)
            response = qa_pipeline(question=question, context=context)
            answer = response['answer']
            # Low-confidence extractions are treated as out of scope.
            if response['score'] < min_score:
                answer = fallback_answer
        else:
            answer = fallback_answer

        memory.save_context({"user_input": question}, {"answer": answer})
        return {"chat_history": memory.chat_memory.messages, "answer": answer}

    return conversation_chain
def handle_question(question):
    """Send *question* to the active conversation and render the chat.

    Calls ``st.session_state.conversation`` with the question, stores the
    returned history in ``st.session_state.chat_history``, and writes each
    message using the user or bot HTML template.

    Args:
        question: The user's question text.
    """
    response = st.session_state.conversation({'question': question})
    st.session_state.chat_history = response["chat_history"]
    for i, msg in enumerate(st.session_state.chat_history):
        # Messages alternate user/bot, starting with the user at index 0.
        template = user_template if i % 2 == 0 else bot_template
        # The templates are rendered as raw HTML, so escape the message text
        # to keep user- or document-supplied markup from being injected.
        safe_content = html.escape(msg.content)
        st.write(template.replace("{{MSG}}", safe_content), unsafe_allow_html=True)
def main():
    """Run the Streamlit app: pick a data source, index it, and chat.

    The conversation chain is kept in ``st.session_state`` together with a
    ``conversation_source`` marker. The index is rebuilt only when the
    selected source changes, so chat memory survives reruns, and a chain
    built for one source is never used to answer questions about another.
    """
    st.set_page_config(page_title="Chat with Notes and AI", page_icon=":books:", layout="wide")
    st.write(css, unsafe_allow_html=True)

    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None
    # Identifies which data the current conversation chain was built from.
    if "conversation_source" not in st.session_state:
        st.session_state.conversation_source = None

    st.header("CSS Edge - Intelligent Document Chatbot with Notes :books:")

    # Subject selection dropdown
    subjects = [
        "A Trumped World", "Agri Tax in Punjab", "Assad's Fall in Syria", "Elusive National Unity", "Europe and Trump 2.0",
        "Going Down with Democracy", "Indonesia's Pancasila Philosophy", "Pakistan in Choppy Waters",
        "Pakistan's Semiconductor Ambitions", "Preserving Pakistan's Cultural Heritage", "Tackling Informal Economy",
        "Technical Education in Pakistan", "The Case for Solidarity Levies", "The Decline of the Sole Superpower",
        "The Power of Big Oil", "Trump 2.0 and Pakistan's Emerging Foreign Policy", "Trump and the World 2.0",
        "Trump vs BRICS", "US-China Trade War", "War on Humanity", "Women's Suppression in Afghanistan"
    ]
    data_folder = "data"
    preview_folder = "Preview"
    subject_folders = {subject: os.path.join(data_folder, subject.replace(' ', '_')) for subject in subjects}
    preview_folders = {subject: os.path.join(preview_folder, subject.replace(' ', '_')) for subject in subjects}

    selected_subject = st.sidebar.selectbox("Select a Subject:", subjects)
    st.sidebar.info(f"You have selected: {selected_subject}")  # Display selected subject

    # Option to upload documents or use preloaded subject data
    use_preloaded = st.sidebar.radio("Select Data Source:", ("Use Preloaded Notes", "Upload Your Documents"))

    if use_preloaded == "Use Preloaded Notes":
        # Load preview content
        preview_folder_path = preview_folders[selected_subject]
        if os.path.exists(preview_folder_path):
            preview_text = get_text_files_content(preview_folder_path)
            st.subheader("Preview of Notes")
            st.text_area("Preview Content:", preview_text, height=300, disabled=True)
        else:
            st.error(f"No preview available for {selected_subject}.")

        # Build the QA chain only when the subject (or data source) changed;
        # rebuilding on every rerun is slow and would wipe the chat memory.
        source = ("preloaded", selected_subject)
        if st.session_state.conversation_source != source:
            st.session_state.conversation = None
            st.session_state.chat_history = None
            st.session_state.conversation_source = None

            subject_folder_path = subject_folders[selected_subject]
            if os.path.exists(subject_folder_path):
                raw_text = get_text_files_content(subject_folder_path)
                # Blank text produces no chunks, which cannot be indexed.
                text_chunks = get_chunks(raw_text) if raw_text.strip() else []
                if text_chunks:
                    vectorstore = get_vectorstore(text_chunks)
                    st.session_state.conversation = get_conversationchain(vectorstore)
                    st.session_state.conversation_source = source
                else:
                    st.error("Could not load the content for question answering.")
            else:
                st.error(f"No data available for {selected_subject}.")
    else:  # Upload documents option
        # Drop any chain built from preloaded notes so it is not used to
        # answer questions about documents that have not been processed yet.
        if st.session_state.conversation_source != "upload":
            st.session_state.conversation = None
            st.session_state.chat_history = None
            st.session_state.conversation_source = None

        docs = st.sidebar.file_uploader("Upload your text files here:", accept_multiple_files=True, type=['txt'])
        if docs:
            st.sidebar.info(f"Uploaded {len(docs)} file(s).")
            if st.sidebar.button("Process"):
                with st.spinner("Processing uploaded documents..."):
                    raw_text = None
                    try:
                        # getvalue() returns the full buffer regardless of the
                        # read position; files are newline-separated so the
                        # end of one does not run into the start of the next.
                        raw_text = "\n".join(doc.getvalue().decode('utf-8') for doc in docs)
                    except UnicodeDecodeError:
                        st.error("Uploaded files must be UTF-8 encoded text.")

                    if raw_text is not None:
                        st.subheader("Uploaded Notes Preview")
                        st.text_area("Preview Content:", raw_text, height=300, disabled=True)
                        text_chunks = get_chunks(raw_text) if raw_text.strip() else []
                        if text_chunks:
                            vectorstore = get_vectorstore(text_chunks)
                            st.session_state.conversation = get_conversationchain(vectorstore)
                            st.session_state.chat_history = None
                            st.session_state.conversation_source = "upload"
                        else:
                            st.error("Could not load the content for question answering.")

    # Chat interface
    # NOTE(review): the text input keeps its value across reruns, so an
    # unrelated rerun re-submits the same question — consider tracking the
    # last handled question.
    question = st.text_input("Ask a question about your selected subject:")
    if question and st.session_state.conversation:
        st.write(f"**Subject:** {selected_subject}")  # Display subject before chat
        handle_question(question)
    elif question:
        st.warning("Please process a document before asking a question.")


if __name__ == '__main__':
    main()