"""Ingest a crawled web page into DeepLake and build a conversational QA chain over it.

``get_and_load_data`` is the one-off ingestion step (crawl, split, embed,
upload). ``final_function`` is the runtime entry point that wires the
stored dataset, a reranking retriever, a chat memory and a Groq LLM into
a ``ConversationalRetrievalChain``.
"""

import os

from dotenv import load_dotenv
from langchain.chains.conversational_retrieval.base import ConversationalRetrievalChain
from langchain.docstore.document import Document
from langchain.document_loaders import ApifyDatasetLoader
from langchain.embeddings.cohere import CohereEmbeddings
from langchain.memory import ConversationBufferWindowMemory
from langchain.retrievers import ContextualCompressionRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.utilities import ApifyWrapper
from langchain.vectorstores.deeplake import DeepLake
from langchain_cohere import CohereRerank
from langchain_groq import ChatGroq

# Populate os.environ from a local .env file before any client is built.
load_dotenv()

# Shared by the ingestion path and the read-only query path so the two
# can never drift apart.
_DEEPLAKE_USERNAME = "gneyapandya1234"
_DEEPLAKE_DATASET_ID = "educational_chatbot"
_DATASET_PATH = f"hub://{_DEEPLAKE_USERNAME}/{_DEEPLAKE_DATASET_ID}"
_EMBEDDING_MODEL = "embed-english-v2.0"
_RERANK_MODEL = "rerank-english-v2.0"
_CRAWL_START_URL = "https://en.wikipedia.org/wiki/Artificial_intelligence"


def _build_embeddings():
    """Return the Cohere embedding function used for both writing and reading."""
    return CohereEmbeddings(model=_EMBEDDING_MODEL)


def _to_document(dataset_item):
    """Map one crawler dataset item to a ``Document``.

    A missing or empty ``"text"`` field is replaced by a placeholder so the
    item is still indexed. ``"url"`` and ``metadata.title`` are required and
    raise ``KeyError`` when absent.
    """
    return Document(
        page_content=dataset_item.get("text") or "No content available",
        metadata={
            "source": dataset_item["url"],
            "title": dataset_item["metadata"]["title"],
        },
    )


def get_and_load_data() -> None:
    """Crawl the start URL, split the text into chunks and upload them to DeepLake.

    This writes to the remote dataset at ``_DATASET_PATH``; run it once to
    populate the store that ``deeplake()`` later opens read-only.
    """
    # NOTE(review): the original read os.getenv("apify") into a local that was
    # never used. ApifyWrapper() is constructed without arguments, so it
    # presumably picks up its token from its own environment variable --
    # confirm the .env file uses the name the wrapper expects.
    apify = ApifyWrapper()
    loader = apify.call_actor(
        actor_id="apify/website-content-crawler",
        run_input={"startUrls": [{"url": _CRAWL_START_URL}]},
        dataset_mapping_function=_to_document,
    )
    docs = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=20,
        length_function=len,
    )
    docs_split = text_splitter.split_documents(docs)

    dbs = DeepLake(dataset_path=_DATASET_PATH, embedding_function=_build_embeddings())
    dbs.add_documents(docs_split)


def deeplake():
    """Open the stored dataset read-only and build its retrievers.

    Returns:
        A ``(dbs, compressor_retriever, retriever)`` tuple: the DeepLake
        store, a ``ContextualCompressionRetriever`` that reranks the base
        retriever's hits with ``CohereRerank`` (``top_n=5``), and the base
        retriever itself.
    """
    dbs = DeepLake(
        dataset_path=_DATASET_PATH,
        read_only=True,
        embedding_function=_build_embeddings(),
    )

    retriever = dbs.as_retriever()
    retriever.search_kwargs["distance_metric"] = "cos"
    retriever.search_kwargs["fetch_k"] = 20
    # Maximal marginal relevance is intentionally left disabled:
    # retriever.search_kwargs["maximal_marginal_relevance"] = True
    # The base retriever hands 20 candidates to the reranker below.
    retriever.search_kwargs["k"] = 20

    compressor = CohereRerank(model=_RERANK_MODEL, top_n=5)
    compressor_retriever = ContextualCompressionRetriever(
        base_compressor=compressor,
        base_retriever=retriever,
    )
    print("DOne")
    return dbs, compressor_retriever, retriever


def memory() -> ConversationBufferWindowMemory:
    """Create the windowed chat memory (``k=3``) used by the QA chain.

    The memory is stored under ``"chat_history"`` and records the chain's
    ``"answer"`` output; presumably the explicit ``output_key`` is needed
    because the chain also returns source documents -- confirm.
    """
    return ConversationBufferWindowMemory(
        k=3,
        memory_key="chat_history",
        return_messages=True,
        output_key="answer",
    )


def create_llm() -> ChatGroq:
    """Create the Groq chat model, authenticated via ``GROQ_API_KEY``."""
    return ChatGroq(api_key=os.getenv("GROQ_API_KEY"), model="llama3-70b-8192")


def chain(llm, compression_retriever, memory):
    """Assemble the conversational retrieval chain.

    Args:
        llm: Chat model that generates the answers.
        compression_retriever: Retriever that supplies context documents.
        memory: Conversation memory object. Note that this parameter
            shadows the module-level ``memory()`` factory inside this body.

    Returns:
        The chain built by ``ConversationalRetrievalChain.from_llm``, set to
        be verbose and to return its source documents.
    """
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        memory=memory,
        retriever=compression_retriever,
        verbose=True,
        return_source_documents=True,
    )


def final_function():
    """Build every component and return the ready-to-use ``(qa, mem)`` pair."""
    llm = create_llm()
    mem = memory()
    # Only the reranking retriever is needed here; the store handle and the
    # base retriever are discarded.
    _, compressor_retriever, _ = deeplake()
    qa = chain(llm, compressor_retriever, mem)
    return qa, mem