# NOTE(review): the three lines below were scraped Hugging Face Spaces UI status
# text ("Spaces: Sleeping") that leaked into the file and broke parsing; kept
# here as a comment so the module remains valid Python.
# Spaces: Sleeping / Sleeping
# Standard library.
import os

# Third-party: environment loading, LangChain core, and LangChain integrations
# (Apify crawler, Cohere embeddings/reranker, Deep Lake vector store, Groq LLM).
from dotenv import load_dotenv
from langchain.chains.conversational_retrieval.base import ConversationalRetrievalChain
from langchain.docstore.document import Document
from langchain.document_loaders import ApifyDatasetLoader
from langchain.embeddings.cohere import CohereEmbeddings
from langchain.memory import ConversationBufferWindowMemory
from langchain.retrievers import ContextualCompressionRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.utilities import ApifyWrapper
from langchain.vectorstores.deeplake import DeepLake
from langchain_cohere import CohereRerank
from langchain_groq import ChatGroq

# Pull API keys (Apify, Cohere, Groq, ActiveLoop) from a local .env file
# into the process environment before any client is constructed.
load_dotenv()
def get_and_load_data():
    """Crawl a website with Apify, chunk the text, and index it in Deep Lake.

    Runs the ``apify/website-content-crawler`` actor against the Wikipedia
    "Artificial intelligence" article, wraps each crawled page in a
    :class:`Document`, splits pages into ~1000-character chunks (20-char
    overlap), embeds them with Cohere, and writes the chunks to the
    ``hub://gneyapandya1234/educational_chatbot`` Deep Lake dataset.

    Returns:
        DeepLake: the populated vector store (previously nothing was
        returned; returning it is backward-compatible for existing callers).
    """
    # Bug fix: `apify_key` was read but never used, so ApifyWrapper silently
    # depended on APIFY_API_TOKEN being set some other way. Export the key
    # stored under "apify" in .env to the env var the wrapper actually reads.
    apify_key = os.getenv("apify")
    if apify_key and not os.getenv("APIFY_API_TOKEN"):
        os.environ["APIFY_API_TOKEN"] = apify_key

    apify = ApifyWrapper()
    loader = apify.call_actor(
        actor_id="apify/website-content-crawler",
        run_input={"startUrls": [{"url": "https://en.wikipedia.org/wiki/Artificial_intelligence"}]},
        # Robustness fix: use .get() so crawled items missing "text", "url",
        # or "metadata" no longer raise KeyError and abort the whole load.
        dataset_mapping_function=lambda dataset_item: Document(
            page_content=dataset_item.get("text") or "No content available",
            metadata={
                "source": dataset_item.get("url", ""),
                "title": (dataset_item.get("metadata") or {}).get("title", ""),
            },
        ),
    )
    docs = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=20, length_function=len
    )
    docs_split = text_splitter.split_documents(docs)

    embeddings = CohereEmbeddings(model="embed-english-v2.0")
    username = "gneyapandya1234"
    db_id = "educational_chatbot"
    dbs = DeepLake(dataset_path=f"hub://{username}/{db_id}", embedding_function=embeddings)
    dbs.add_documents(docs_split)
    return dbs
def deeplake():
    """Open the read-only Deep Lake index and build a reranking retriever.

    Returns:
        tuple: ``(dbs, compressor_retriever, retriever)`` — the Deep Lake
        store, a ContextualCompressionRetriever that reranks with Cohere,
        and the plain vector-store retriever.
    """
    embeddings = CohereEmbeddings(model="embed-english-v2.0")
    dbs = DeepLake(
        dataset_path="hub://gneyapandya1234/educational_chatbot",
        read_only=True,  # index is written once by get_and_load_data()
        embedding_function=embeddings,
    )

    # Cosine similarity; over-fetch 20 candidates so the reranker below
    # has a pool to choose from.
    retriever = dbs.as_retriever()
    retriever.search_kwargs["distance_metric"] = "cos"
    retriever.search_kwargs["fetch_k"] = 20
    retriever.search_kwargs["k"] = 20

    # Cohere cross-encoder reranker keeps only the 5 most relevant chunks.
    compressor = CohereRerank(model="rerank-english-v2.0", top_n=5)
    compressor_retriever = ContextualCompressionRetriever(
        base_compressor=compressor, base_retriever=retriever
    )
    print("Done")  # fixed log-message typo ("DOne"); dead commented-out code removed
    return dbs, compressor_retriever, retriever
def memory():
    """Create a sliding-window chat memory holding the last 3 exchanges.

    ``memory_key``/``output_key`` match what ConversationalRetrievalChain
    expects when it is configured to also return source documents.
    """
    window_memory = ConversationBufferWindowMemory(
        k=3,
        memory_key="chat_history",
        return_messages=True,
        output_key="answer",
    )
    return window_memory
def create_llm():
    """Return a Groq-hosted Llama 3 70B chat model, keyed from the environment."""
    groq_key = os.getenv("GROQ_API_KEY")
    return ChatGroq(api_key=groq_key, model="llama3-70b-8192")
def chain(llm, compression_retriever, memory):
    """Wire the LLM, reranking retriever, and chat memory into a RAG chain.

    Returns the ConversationalRetrievalChain (verbose, with source
    documents included in its outputs).
    """
    qa_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        memory=memory,
        retriever=compression_retriever,
        verbose=True,
        return_source_documents=True,
    )
    return qa_chain
def final_function():
    """Assemble the full pipeline and return ``(qa_chain, memory)``."""
    llm = create_llm()
    chat_memory = memory()
    _dbs, reranking_retriever, _plain_retriever = deeplake()
    qa = chain(llm, reranking_retriever, chat_memory)
    return qa, chat_memory