Spaces:
Runtime error
Runtime error
harpreetsahota
committed on
Commit
·
d56ca5d
1
Parent(s):
bc8a7e8
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import re
|
3 |
+
import getpass
|
4 |
+
import langchain
|
5 |
+
from langchain.document_loaders import WebBaseLoader
|
6 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
7 |
+
from langchain.embeddings import CacheBackedEmbeddings
|
8 |
+
from langchain.embeddings import OpenAIEmbeddings
|
9 |
+
from langchain.vectorstores import FAISS
|
10 |
+
from langchain.storage import LocalFileStore
|
11 |
+
from typing import List, Union
|
12 |
+
import gradio as gr
|
13 |
+
|
14 |
+
from langchain.chains import ConversationalRetrievalChain
|
15 |
+
from langchain.memory import ConversationBufferMemory
|
16 |
+
from langchain.chat_models import ChatOpenAI
|
17 |
+
|
18 |
+
def find_urls(text: str) -> List[str]:
    """
    Extract URLs from a given text.

    This function looks for patterns starting with 'http://', 'https://', or 'www.'
    followed by any non-whitespace characters. It captures common URL formats
    but might not capture all possible URL variations (e.g. trailing punctuation
    is included, since \\S+ is greedy up to the next whitespace).

    Args:
    - text (str): The input string from which URLs need to be extracted.

    Returns:
    - List[str]: A list containing all the URLs found in the input text
      (empty list when none are found).
    """
    # Regular expression to match common URLs and ones starting with 'www.'
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.findall(text)
|
35 |
+
|
36 |
+
def website_loader(website: Union[str, list[str]]) -> List[langchain.schema.document.Document]:
    """
    Load one or more websites into LangChain Document objects.

    A WebBaseLoader is constructed for the given URL (or list of URLs) and
    its ``load()`` method is invoked to fetch and parse the page content.

    Parameters:
    - website (Union[str, list[str]]): A single website URL or a list of URLs.

    Returns:
    - List[langchain.schema.document.Document]: The Documents produced from
      the loaded website(s).
    """
    print("Loading website(s) into Documents...")
    loader = WebBaseLoader(web_path=website)
    docs = loader.load()
    print("Done loading website(s).")
    return docs
|
54 |
+
|
55 |
+
def split_text(documents: List) -> List[langchain.schema.document.Document]:
    """
    Split Documents into overlapping chunks.

    Uses RecursiveCharacterTextSplitter with a chunk size of 1000 characters
    and an overlap of 50 characters (length measured with ``len``), and
    returns the resulting chunk Documents.

    Parameters:
    - documents (List): Document objects to split.

    Returns:
    - List[langchain.schema.document.Document]: The chunked Documents.

    Note:
    - Chunk size / overlap are fixed at 1000 / 50; adjust here if needed.
    """
    print("Splitting documents into chunks...")
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=50,
        length_function=len,
    )
    pieces = splitter.transform_documents(documents)
    print("Done splitting documents.")
    return pieces
|
81 |
+
|
82 |
+
def get_document_embeddings(chunks: List) -> langchain.embeddings.cache.CacheBackedEmbeddings:
    """
    Build a cache-backed embedder around the OpenAI embeddings model.

    A LocalFileStore under ``./cache/`` backs the cache, namespaced by the
    OpenAI model name so different models do not collide.

    Parameters:
    - chunks (List): Document chunks this embedder is intended for.
      NOTE(review): ``chunks`` is not consumed here — embeddings are computed
      later (e.g. by the vector store) using the returned embedder; the
      parameter is kept for interface compatibility.

    Returns:
    - langchain.embeddings.cache.CacheBackedEmbeddings: The cache-backed
      embedder to use when vectorizing the chunks.
    """
    print("Creating embedder...")
    cache_store = LocalFileStore("./cache/")
    base_model = OpenAIEmbeddings()
    cached_embedder = CacheBackedEmbeddings.from_bytes_store(
        base_model,
        cache_store,
        namespace=base_model.model,
    )
    print("Done creating embedder")
    return cached_embedder
|
106 |
+
|
107 |
+
def create_vector_store(chunks: List[langchain.schema.document.Document],
                        embedder: langchain.embeddings.cache.CacheBackedEmbeddings) -> langchain.vectorstores.faiss.FAISS:
    """
    Build a FAISS vector store from Document chunks.

    The provided embedder vectorizes each chunk and the resulting vectors are
    indexed in a FAISS store.

    Parameters:
    - chunks (List[langchain.schema.document.Document]): Chunks to vectorize.
    - embedder (langchain.embeddings.cache.CacheBackedEmbeddings): Embedder
      used to produce the chunk vectors.

    Returns:
    - langchain.vectorstores.faiss.FAISS: The populated FAISS vector store.
    """
    print("Creating vectorstore...")
    return FAISS.from_documents(chunks, embedder)
|
126 |
+
|
127 |
+
def create_retriever(vectorstore: langchain.vectorstores) -> langchain.vectorstores.base.VectorStoreRetriever:
    """
    Wrap a FAISS vector store in a retriever.

    The retriever exposes the store through LangChain's retrieval interface
    so similar documents can be fetched for a query.

    Parameters:
    - vectorstore (langchain.vectorstores): The FAISS vector store to wrap.

    Returns:
    - langchain.vectorstores.base.VectorStoreRetriever: A retriever over the
      given store.
    """
    print("Creating vectorstore retriever...")
    return vectorstore.as_retriever()
|
145 |
+
|
146 |
+
def embed_user_query(query: str) -> List[float]:
    """
    Embed a user query with the OpenAIEmbeddings model.

    Parameters:
    - query (str): The user query to embed.

    Returns:
    - List[float]: The query's embedding vector.
    """
    model = OpenAIEmbeddings()
    return model.embed_query(query)
|
162 |
+
|
163 |
+
def similarity_search(vectorstore,
                      embedded_query: List[float],
                      k: int = 4) -> List:
    """
    Performs a similarity search on the provided FAISS vector store using an embedded query.

    This function takes an embedded query and searches the FAISS vector store for the most
    similar vectors/documents based on the embedded query.

    Parameters:
    - vectorstore: A FAISS vector store containing vectors of document chunks.
    - embedded_query (List[float]): The embedded vector of the user query.
    - k (int, optional): Number of most-similar documents to retrieve.
      Defaults to 4, preserving the original hard-coded behavior.

    Returns:
    - List: The k Document objects most similar to the embedded query.
    """
    # k was previously hard-coded to 4; it is now a keyword parameter with the
    # same default, so existing callers are unaffected.
    return vectorstore.similarity_search_by_vector(embedded_query, k=k)
|
185 |
+
|
186 |
+
|
187 |
+
def create_chatbot(retriever: langchain.vectorstores) -> langchain.chains.conversational_retrieval:
    """
    Create a conversational chatbot backed by the given retriever.

    Assembles a ConversationalRetrievalChain that uses the gpt-3.5-turbo chat
    model for generation and the retriever for document lookup.

    Parameters:
    - retriever (langchain.vectorstores): Retriever used for similarity-based
      document retrieval.

    Returns:
    - langchain.chains.conversational_retrieval: The chain acting as the chatbot.

    Note:
    - Conversation history is kept under the 'chat_history' memory key and is
      replayed (as message objects) for context in subsequent turns.
    """
    chat_model = ChatOpenAI(model="gpt-3.5-turbo")

    history = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )

    return ConversationalRetrievalChain.from_llm(
        llm=chat_model,
        retriever=retriever,
        memory=history,
    )
|
219 |
+
|
220 |
+
def chat(conversation_chain, input: str) -> str:
    """
    Interacts with the chatbot using the provided input and returns its response.

    This function passes the user's input to the conversational chain for
    processing and retrieves the chatbot's response.

    Parameters:
    - conversation_chain: The ConversationalRetrievalChain instance (as
      returned by create_chatbot) that processes the query. (The original
      docstring omitted this parameter.)
    - input (str): The user's input/question to the chatbot.
      NOTE: the name shadows the builtin `input`; kept for backward
      compatibility with existing keyword callers.

    Returns:
    - str: The chatbot's response to the user's input.
    """
    return conversation_chain.run(input)
|
235 |
+
|
236 |
+
|
237 |
+
|
238 |
+
# This chatbot_instance will be initialized once a URL is provided.
# Module-level singleton shared across Gradio callbacks: None until the first
# message containing a URL triggers initialization in respond().
chatbot_instance = None
|
240 |
+
|
241 |
+
def respond(message, chat_history):
    """
    Gradio callback: route a user message to the chatbot, building it on demand.

    On the first message containing URL(s), the full pipeline is run
    (load -> split -> embed -> index -> retriever -> chain) and the chain is
    stored in the module-level `chatbot_instance`. Subsequent messages are
    answered by that chain; messages before initialization get a prompt to
    supply a URL.

    Parameters:
    - message (str): The user's message; may contain URL(s).
    - chat_history (list): Gradio chat history of (user, bot) message pairs.

    Returns:
    - tuple: ("", updated chat_history) — the empty string clears the textbox.
    """
    global chatbot_instance
    urls = find_urls(message)
    # If the chatbot is not yet initialized and we have URLs, initialize it
    if not chatbot_instance and urls:
        documents = website_loader(urls)
        chunks = split_text(documents)
        embedder = get_document_embeddings(chunks)
        vectorstore = create_vector_store(chunks, embedder)
        retriever = create_retriever(vectorstore)
        chatbot_instance = create_chatbot(retriever)
        bot_message = "Chatbot initialized! How can I help you?"
    elif chatbot_instance:
        # BUG FIX: chat() expects (conversation_chain, input); the original
        # call passed (message, chatbot_instance) — swapped — which raised at
        # runtime on every chat turn after initialization.
        bot_message = chat(chatbot_instance, message)
    else:
        bot_message = "Please provide a URL to initialize the chatbot first."

    chat_history.append((message, bot_message))
    return "", chat_history
|
261 |
+
|
262 |
+
# Gradio UI: a chat window plus a query textbox and a clear button.
# Submitting the textbox calls respond(), whose ("", history) return value
# clears the input field and refreshes the chat display.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    user_query = gr.Textbox(label="Your Query", placeholder="What would you like to chat about?")
    clear = gr.ClearButton([user_query, chatbot])

    user_query.submit(respond, [user_query, chatbot], [user_query, chatbot])

# Launch the app (blocking call; serves the interface).
demo.launch()
|