import os
from operator import itemgetter

import chainlit as cl
from langchain.prompts import ChatPromptTemplate
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores import Qdrant
from langchain_core.output_parsers import StrOutputParser
from langchain_openai.chat_models import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings
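
# Run the app with: chainlit run app.py
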
# Load all the documents in the directory
documents = []
directory = "data/"
for filename in os.listdir(directory):
    if filename.endswith(".pdf"):  # Only process PDF files
        file_path = os.path.join(directory, filename)
        loader = PyMuPDFLoader(file_path)
        docs = loader.load()
        documents.extend(docs)
# Split the documents
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=40,
    length_function=len,
    is_separator_regex=False,
)
rag_documents = text_splitter.split_documents(documents)
# Alternative chunking by tokens (more accurate for OpenAI models):
# token_text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
#     encoding_name="cl100k_base", chunk_size=100, chunk_overlap=0
# )
# token_rag_documents = token_text_splitter.split_documents(documents)
# TODO: test the token-based splitter above
# Alternative: split the documents by character (on blank lines)
character_text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)
character_rag_documents = character_text_splitter.split_documents(documents)
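# NOTE: character_rag_documents is currently unused; the vector store below is
# built from rag_documents produced by the recursive splitter.
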
embedding = OpenAIEmbeddings(model="text-embedding-3-small")
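# text-embedding-3-small embeds each chunk as a 1536-dimensional vector by default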
# Create the vector store
vectorstore = Qdrant.from_documents(
    rag_documents,
    embedding,
    location=":memory:",
    collection_name="Implications of AI",
)
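# NOTE: location=":memory:" keeps the index in RAM, so it is rebuilt on every
# restart; Qdrant.from_documents also accepts path=... (local disk) or url=...
# (remote server) if persistence is needed.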
retriever = vectorstore.as_retriever()
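# as_retriever() defaults to similarity search returning the top 4 chunks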
llm = ChatOpenAI(model="gpt-4")
@cl.on_chat_start
async def start_chat():
    template = """
    Use the provided context to answer the user's query.
    You may not answer the user's query unless there is specific context in the following text.
    If you do not know the answer, or cannot answer, please respond with "I don't know".

    Question:
    {question}

    Context:
    {context}

    Answer:
    """
    prompt = ChatPromptTemplate.from_template(template)
    # Build the RAG chain: retrieve context for the question, fill the prompt,
    # call the model, and parse the output to a plain string
    base_chain = (
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        | prompt
        | llm
        | StrOutputParser()
    )
    cl.user_session.set("chain", base_chain)

@cl.on_message
async def main(message: cl.Message):
    chain = cl.user_session.get("chain")
    msg = cl.Message(content="")
    # Stream the chain's answer token by token as it is generated
    async for token in chain.astream({"question": message.content}):
        await msg.stream_token(token)
    await msg.send()