# agent_app/rag.py
# (uploaded via huggingface_hub, commit 30c6ff0, by ridhimamlds)
from pydantic import BaseModel, Field
from langchain.tools import Tool
from langchain_community.vectorstores import Neo4jVector
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain import hub
import os
# Configure the OpenAI API key for the LLM / embeddings clients.
# SECURITY: never hard-code credentials in source control — the key that was
# previously committed here is compromised and must be revoked/rotated.
# The key is now taken from the environment; OpenAI client libraries read
# OPENAI_API_KEY directly, so setting it is all that is required.
key = os.getenv("OPENAI_API_KEY")
if key:
    os.environ["OPENAI_API_KEY"] = key  # keep the prior module contract
else:
    # Consistent with this file's print-based status reporting.
    print("WARNING: OPENAI_API_KEY is not set; OpenAI calls will fail.")
class RAGToolConfig(BaseModel):
    """Connection and data settings for the RAG tool.

    All secrets default to environment variables so that credentials never
    live in source control. SECURITY: the Neo4j password previously
    hard-coded here is compromised and must be rotated on the Aura instance.

    Environment variables:
        NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD, RAG_PDF_PATH
    """

    # Neo4j Aura connection URI, e.g. "neo4j+s://<dbid>.databases.neo4j.io".
    # default_factory reads the environment at instantiation time, not import time.
    NEO4J_URI: str = Field(default_factory=lambda: os.getenv("NEO4J_URI", ""))
    # Aura's default user name is "neo4j".
    NEO4J_USERNAME: str = Field(default_factory=lambda: os.getenv("NEO4J_USERNAME", "neo4j"))
    # Read from the environment only; an empty default fails fast at connect time.
    NEO4J_PASSWORD: str = Field(default_factory=lambda: os.getenv("NEO4J_PASSWORD", ""))
    # Path of the PDF that is ingested when the vector index must be (re)built.
    pdf_path: str = Field(default_factory=lambda: os.getenv("RAG_PDF_PATH", "/mnt/d/atx/hragent/rag/Sirca_Paints.pdf"))
class RAGToolImplementation:
    """Retrieval-Augmented Generation over a single PDF stored in Neo4j.

    On construction this attaches to (or builds) a Neo4j vector index of
    PDF chunks, then wires a LangChain retrieval chain that answers
    questions using the supplied chat model.
    """

    def __init__(self, config: RAGToolConfig, llm):
        # `llm` is a LangChain-compatible chat model (e.g. ChatOpenAI per the
        # commented example below); it is piped into the chain in _setup_rag_chain.
        self.config = config
        self.llm = llm  # Store the llm instance
        # NOTE(review): OpenAIEmbeddings() reads OPENAI_API_KEY from the
        # environment — assumes the module-level key setup ran first.
        self.embedding_model = OpenAIEmbeddings()
        self.vectorstore = self._initialize_vectorstore()
        self.rag_chain = self._setup_rag_chain()

    def _initialize_vectorstore(self):
        """Return a Neo4jVector, reusing the existing index when possible.

        Strategy: attach to the index named "pdf_embeddings" and probe it
        with a trivial similarity search; any failure (missing index, bad
        credentials, network error) falls through to re-ingesting the PDF
        at ``config.pdf_path`` and building the index from scratch.
        """
        try:
            # Try to load existing vector store
            vectorstore = Neo4jVector(
                url=self.config.NEO4J_URI,
                username=self.config.NEO4J_USERNAME,
                password=self.config.NEO4J_PASSWORD,
                embedding=self.embedding_model,
                index_name="pdf_embeddings",
                node_label="PDFChunk",
                text_node_property="text",
                embedding_node_property="embedding"
            )
            # Probe query: raises if the index is absent or unusable.
            vectorstore.similarity_search("Test query", k=1)
            print("Existing vector store loaded.")
        except Exception as e:
            # NOTE(review): broad catch — a transient network/auth error also
            # triggers a full (slow, paid) re-embedding of the PDF; consider
            # narrowing to the specific "index not found" error.
            print(f"Creating new vector store. Error: {e}")
            # Load and process the PDF
            loader = PyPDFLoader(self.config.pdf_path)
            docs = loader.load()
            # Split the document into chunks
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
            splits = text_splitter.split_documents(docs)
            # Create new vector store
            vectorstore = Neo4jVector.from_documents(
                documents=splits,
                embedding=self.embedding_model,
                url=self.config.NEO4J_URI,
                username=self.config.NEO4J_USERNAME,
                password=self.config.NEO4J_PASSWORD,
                index_name="pdf_embeddings",
                node_label="PDFChunk",
                text_node_property="text",
                embedding_node_property="embedding"
            )
            print("New vector store created and loaded.")
        return vectorstore

    def _setup_rag_chain(self):
        """Build the LCEL chain: retriever -> prompt -> llm -> string output."""
        retriever = self.vectorstore.as_retriever()
        # Community RAG prompt pulled from LangChain Hub (requires network).
        prompt = hub.pull("rlm/rag-prompt")

        def format_docs(docs):
            # Concatenate retrieved chunk texts into one context string.
            return "\n\n".join(doc.page_content for doc in docs)

        rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt
            | self.llm  # Use the llm instance here
            | StrOutputParser()
        )
        return rag_chain

    def run(self, query: str) -> str:
        """Answer *query* via the RAG chain; on failure return an error string
        rather than raising (suits LangChain Tool invocation)."""
        try:
            response = self.rag_chain.invoke(query)
            return response
        except Exception as e:
            return f"An error occurred while processing the query: {str(e)}"
def create_rag_tool(config: RAGToolConfig = None, llm=None):
    """Create a LangChain Tool that answers questions about the indexed PDF.

    Args:
        config: Neo4j/PDF settings. When omitted, a fresh ``RAGToolConfig``
            is built per call — the previous ``config=RAGToolConfig()``
            default was a single instance created at import time and shared
            by every call (mutable-default pitfall), which also froze any
            environment-derived settings at import.
        llm: LangChain chat model used to generate answers; required for the
            tool to actually work.

    Returns:
        A ``Tool`` whose ``func`` is ``RAGToolImplementation.run``.
    """
    if config is None:
        config = RAGToolConfig()
    implementation = RAGToolImplementation(config, llm)
    return Tool(
        name="RAGTool",
        description="Retrieval-Augmented Generation Tool for querying PDF content about Sirca Paints",
        func=implementation.run
    )
# # Example Usage
# if __name__ == "__main__":
# llm = ChatOpenAI(model="gpt-4", temperature=0)
# rag_tool = create_rag_tool(llm=llm)
# # Test the tool
# result = rag_tool.run("What is spil ethics?")
# print(result)