import os
from typing import Any, List, Optional

import chainlit as cl
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.vectorstores import Chroma
from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import LLM
from llama_cpp import Llama


class LlamaCppLLM(LLM):
    """Custom LangChain wrapper for llama.cpp"""

    # Default of None keeps the pydantic field optional; the real model is assigned in __init__
    model: Any = None

    def __init__(self, model: Llama):
        super().__init__()
        self.model = model

    @property
    def _llm_type(self) -> str:
        return "llama.cpp"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        response = self.model.create_chat_completion(
            messages=[{"role": "user", "content": prompt}],
            **kwargs,
        )
        return response["choices"][0]["message"]["content"]


# Initialize the embedding model
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

# Load the existing Chroma vector store
persist_directory = os.path.join(os.path.dirname(__file__), "mydb")
vectorstore = Chroma(
    persist_directory=persist_directory,
    embedding_function=embeddings,
)

# Initialize the Llama model
llama_model = Llama.from_pretrained(
    repo_id="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
    filename="Meta-Llama-3.1-8B-Instruct-IQ2_M.gguf",
    n_ctx=2048,      # Context window
    n_threads=4,     # Number of CPU threads to use
    n_gpu_layers=0,  # Set to a higher number if using a GPU
)

# Create the LangChain wrapper
llm = LlamaCppLLM(model=llama_model)

# Create the RAG prompt template
template = """You are a helpful AI assistant. Using only the following context, answer the user's question.
If you cannot find the answer in the context, say "I don't have enough information to answer this question."

Context: {context}

Question: {question}

Answer: Let me help you with that."""

prompt = ChatPromptTemplate.from_template(template)


@cl.on_chat_start
async def start():
    # Send the initial greeting
    await cl.Message(
        content="Hi! I'm ready to answer your questions based on the stored documents. What would you like to know?"
    ).send()


@cl.on_message
async def main(message: cl.Message):
    # Create a placeholder message to update once the answer is ready
    msg = cl.Message(content="")
    await msg.send()

    # Wrap retrieval and generation in a step so progress is visible in the UI
    async with cl.Step(name="Searching documents..."):
        try:
            # Search the vector store
            retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

            # Create the RAG chain
            rag_chain = (
                {"context": retriever, "question": RunnablePassthrough()}
                | prompt
                | llm
                | StrOutputParser()
            )

            # Execute the chain off the event loop; LCEL runnables are called via .invoke()
            response = await cl.make_async(rag_chain.invoke)(message.content)

            # Update the placeholder message with the response
            msg.content = response
            await msg.update()

            # Show source documents
            docs = await cl.make_async(retriever.get_relevant_documents)(message.content)
            elements = []
            for i, doc in enumerate(docs):
                source_name = f"Source {i + 1}"
                elements.append(
                    cl.Text(name=source_name, content=doc.page_content, display="inline")
                )
            if elements:
                msg.elements = elements
                await msg.update()

        except Exception as e:
            msg.content = f"An error occurred: {str(e)}"
            await msg.update()


# Start the app with the Chainlit CLI: chainlit run <this file>