Spaces:
Runtime error
Runtime error
harpreetsahota
committed on
Commit
·
d56ca5d
1
Parent(s):
bc8a7e8
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import re
|
3 |
+
import getpass
|
4 |
+
import langchain
|
5 |
+
from langchain.document_loaders import WebBaseLoader
|
6 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
7 |
+
from langchain.embeddings import CacheBackedEmbeddings
|
8 |
+
from langchain.embeddings import OpenAIEmbeddings
|
9 |
+
from langchain.vectorstores import FAISS
|
10 |
+
from langchain.storage import LocalFileStore
|
11 |
+
from typing import List, Union
|
12 |
+
import gradio as gr
|
13 |
+
|
14 |
+
from langchain.chains import ConversationalRetrievalChain
|
15 |
+
from langchain.memory import ConversationBufferMemory
|
16 |
+
from langchain.chat_models import ChatOpenAI
|
17 |
+
|
18 |
+
def find_urls(text: str) -> List[str]:
    """
    Extract URLs from a given text.

    This function looks for patterns starting with 'http://', 'https://', or 'www.'
    followed by any non-whitespace characters. It captures common URL formats
    but might not capture all possible URL variations (e.g. trailing punctuation
    is included, since \\S+ is greedy up to the next whitespace).

    Args:
    - text (str): The input string from which URLs need to be extracted.

    Returns:
    - List[str]: A list containing all the URLs found in the input text
      (empty list when none are found).
    """
    # Regular expression to match common URLs and ones starting with 'www.'
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.findall(text)
|
35 |
+
|
36 |
+
def website_loader(website: Union[str, list[str]]) -> List[langchain.schema.document.Document]:
    """
    Load one or more websites into LangChain Document objects.

    A WebBaseLoader is constructed for the given URL (or list of URLs) and
    its ``load()`` method is invoked to fetch and parse the page content.

    Parameters:
    - website (Union[str, list[str]]): A single website URL or a list of URLs.

    Returns:
    - List[langchain.schema.document.Document]: The Documents produced from
      the loaded website(s).
    """
    print("Loading website(s) into Documents...")
    loader = WebBaseLoader(web_path=website)
    docs = loader.load()
    print("Done loading website(s).")
    return docs
|
54 |
+
|
55 |
+
def split_text(documents: List) -> List[langchain.schema.document.Document]:
    """
    Split Documents into overlapping chunks.

    Uses RecursiveCharacterTextSplitter with a chunk size of 1000 characters
    and an overlap of 50 characters (length measured with ``len``), and
    returns the resulting chunk Documents.

    Parameters:
    - documents (List): Document objects to split.

    Returns:
    - List[langchain.schema.document.Document]: The chunked Documents.

    Note:
    - Chunk size / overlap are fixed at 1000 / 50; adjust here if needed.
    """
    print("Splitting documents into chunks...")
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=50,
        length_function=len,
    )
    pieces = splitter.transform_documents(documents)
    print("Done splitting documents.")
    return pieces
|
81 |
+
|
82 |
+
def get_document_embeddings(chunks: List) -> langchain.embeddings.cache.CacheBackedEmbeddings:
    """
    Build a cache-backed embedder around the OpenAI embeddings model.

    A LocalFileStore under ``./cache/`` backs the cache, namespaced by the
    OpenAI model name so different models do not collide.

    Parameters:
    - chunks (List): Document chunks this embedder is intended for.
      NOTE(review): ``chunks`` is not consumed here — embeddings are computed
      later (e.g. by the vector store) using the returned embedder; the
      parameter is kept for interface compatibility.

    Returns:
    - langchain.embeddings.cache.CacheBackedEmbeddings: The cache-backed
      embedder to use when vectorizing the chunks.
    """
    print("Creating embedder...")
    cache_store = LocalFileStore("./cache/")
    base_model = OpenAIEmbeddings()
    cached_embedder = CacheBackedEmbeddings.from_bytes_store(
        base_model,
        cache_store,
        namespace=base_model.model,
    )
    print("Done creating embedder")
    return cached_embedder
|
106 |
+
|
107 |
+
def create_vector_store(chunks: List[langchain.schema.document.Document],
                        embedder: langchain.embeddings.cache.CacheBackedEmbeddings) -> langchain.vectorstores.faiss.FAISS:
    """
    Build a FAISS vector store from Document chunks.

    The provided embedder vectorizes each chunk and the resulting vectors are
    indexed in a FAISS store.

    Parameters:
    - chunks (List[langchain.schema.document.Document]): Chunks to vectorize.
    - embedder (langchain.embeddings.cache.CacheBackedEmbeddings): Embedder
      used to produce the chunk vectors.

    Returns:
    - langchain.vectorstores.faiss.FAISS: The populated FAISS vector store.
    """
    print("Creating vectorstore...")
    return FAISS.from_documents(chunks, embedder)
|
126 |
+
|
127 |
+
def create_retriever(vectorstore: langchain.vectorstores) -> langchain.vectorstores.base.VectorStoreRetriever:
    """
    Wrap a FAISS vector store in a retriever.

    The retriever exposes the store through LangChain's retrieval interface
    so similar documents can be fetched for a query.

    Parameters:
    - vectorstore (langchain.vectorstores): The FAISS vector store to wrap.

    Returns:
    - langchain.vectorstores.base.VectorStoreRetriever: A retriever over the
      given store.
    """
    print("Creating vectorstore retriever...")
    return vectorstore.as_retriever()
|
145 |
+
|
146 |
+
def embed_user_query(query: str) -> List[float]:
    """
    Embed a user query with the OpenAIEmbeddings model.

    Parameters:
    - query (str): The user query to embed.

    Returns:
    - List[float]: The query's embedding vector.
    """
    model = OpenAIEmbeddings()
    return model.embed_query(query)
|
162 |
+
|
163 |
+
def similarity_search(vectorstore,
                      embedded_query: List[float],
                      k: int = 4) -> List:
    """
    Performs a similarity search on the provided FAISS vector store using an embedded query.

    This function takes an embedded query and searches the FAISS vector store for the most
    similar vectors/documents based on the embedded query.

    Parameters:
    - vectorstore: A FAISS vector store containing vectors of document chunks.
    - embedded_query (List[float]): The embedded vector of the user query.
    - k (int, optional): Number of most-similar documents to retrieve.
      Defaults to 4, preserving the original hard-coded behavior.

    Returns:
    - List: The k Document objects most similar to the embedded query.
    """
    # k was previously hard-coded to 4; it is now a keyword parameter with the
    # same default, so existing callers are unaffected.
    return vectorstore.similarity_search_by_vector(embedded_query, k=k)
|
185 |
+
|
186 |
+
|
187 |
+
def create_chatbot(retriever: langchain.vectorstores) -> langchain.chains.conversational_retrieval:
    """
    Create a conversational chatbot backed by the given retriever.

    Assembles a ConversationalRetrievalChain that uses the gpt-3.5-turbo chat
    model for generation and the retriever for document lookup.

    Parameters:
    - retriever (langchain.vectorstores): Retriever used for similarity-based
      document retrieval.

    Returns:
    - langchain.chains.conversational_retrieval: The chain acting as the chatbot.

    Note:
    - Conversation history is kept under the 'chat_history' memory key and is
      replayed (as message objects) for context in subsequent turns.
    """
    chat_model = ChatOpenAI(model="gpt-3.5-turbo")

    history = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )

    return ConversationalRetrievalChain.from_llm(
        llm=chat_model,
        retriever=retriever,
        memory=history,
    )
|
219 |
+
|
220 |
+
def chat(conversation_chain, input: str) -> str:
    """
    Interacts with the chatbot using the provided input and returns its response.

    This function passes the user's input to the conversational chain for
    processing and retrieves the chatbot's response.

    Parameters:
    - conversation_chain: The ConversationalRetrievalChain instance (as
      returned by create_chatbot) that processes the query. (The original
      docstring omitted this parameter.)
    - input (str): The user's input/question to the chatbot.
      NOTE: the name shadows the builtin `input`; kept for backward
      compatibility with existing keyword callers.

    Returns:
    - str: The chatbot's response to the user's input.
    """
    return conversation_chain.run(input)
|
235 |
+
|
236 |
+
|
237 |
+
|
238 |
+
# This chatbot_instance will be initialized once a URL is provided.
# Module-level singleton shared across Gradio callbacks: None until the first
# message containing a URL triggers initialization in respond().
chatbot_instance = None
|
240 |
+
|
241 |
+
def respond(message, chat_history):
    """
    Gradio callback: route a user message to the chatbot, building it on demand.

    On the first message containing URL(s), the full pipeline is run
    (load -> split -> embed -> index -> retriever -> chain) and the chain is
    stored in the module-level `chatbot_instance`. Subsequent messages are
    answered by that chain; messages before initialization get a prompt to
    supply a URL.

    Parameters:
    - message (str): The user's message; may contain URL(s).
    - chat_history (list): Gradio chat history of (user, bot) message pairs.

    Returns:
    - tuple: ("", updated chat_history) — the empty string clears the textbox.
    """
    global chatbot_instance
    urls = find_urls(message)
    # If the chatbot is not yet initialized and we have URLs, initialize it
    if not chatbot_instance and urls:
        documents = website_loader(urls)
        chunks = split_text(documents)
        embedder = get_document_embeddings(chunks)
        vectorstore = create_vector_store(chunks, embedder)
        retriever = create_retriever(vectorstore)
        chatbot_instance = create_chatbot(retriever)
        bot_message = "Chatbot initialized! How can I help you?"
    elif chatbot_instance:
        # BUG FIX: chat() expects (conversation_chain, input); the original
        # call passed (message, chatbot_instance) — swapped — which raised at
        # runtime on every chat turn after initialization.
        bot_message = chat(chatbot_instance, message)
    else:
        bot_message = "Please provide a URL to initialize the chatbot first."

    chat_history.append((message, bot_message))
    return "", chat_history
|
261 |
+
|
262 |
+
# Gradio UI: a chat window plus a query textbox and a clear button.
# Submitting the textbox calls respond(), whose ("", history) return value
# clears the input field and refreshes the chat display.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    user_query = gr.Textbox(label="Your Query", placeholder="What would you like to chat about?")
    clear = gr.ClearButton([user_query, chatbot])

    user_query.submit(respond, [user_query, chatbot], [user_query, chatbot])

# Launch the app (blocking call; serves the interface).
demo.launch()
|