harpreetsahota commited on
Commit
d56ca5d
·
1 Parent(s): bc8a7e8

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +269 -0
app.py ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import getpass
4
+ import langchain
5
+ from langchain.document_loaders import WebBaseLoader
6
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+ from langchain.embeddings import CacheBackedEmbeddings
8
+ from langchain.embeddings import OpenAIEmbeddings
9
+ from langchain.vectorstores import FAISS
10
+ from langchain.storage import LocalFileStore
11
+ from typing import List, Union
12
+ import gradio as gr
13
+
14
+ from langchain.chains import ConversationalRetrievalChain
15
+ from langchain.memory import ConversationBufferMemory
16
+ from langchain.chat_models import ChatOpenAI
17
+
18
def find_urls(text: str) -> List:
    """
    Return every URL-like substring found in *text*.

    Matches substrings beginning with 'http://', 'https://', or 'www.'
    followed by any run of non-whitespace characters. This covers common
    URL forms but may miss exotic variants (and may capture trailing
    punctuation stuck to a URL).

    Args:
    - text (str): The input string to scan for URLs.

    Returns:
    - list: All URL matches, in order of appearance (empty if none).
    """
    # 'https?://...' handles both schemes; 'www.' catches scheme-less links.
    matcher = re.compile(r'https?://\S+|www\.\S+')
    return matcher.findall(text)
35
+
36
def website_loader(website: Union[str, list[str]]) -> List[langchain.schema.document.Document]:
    """
    Load one or more web pages into LangChain Document objects.

    A WebBaseLoader is constructed for the given URL (or list of URLs)
    and its load() result is returned unchanged.

    Parameters:
    - website (Union[str, list[str]]): A single URL string or a list of URLs.

    Returns:
    - List[langchain.schema.document.Document]: One Document per loaded page.
    """
    print("Loading website(s) into Documents...")
    loader = WebBaseLoader(web_path=website)
    docs = loader.load()
    print("Done loading website(s).")
    return docs
54
+
55
def split_text(documents: List) -> List[langchain.schema.document.Document]:
    """
    Chunk the given documents with RecursiveCharacterTextSplitter.

    Each input Document is broken into overlapping chunks and the chunks
    are returned as a flat list of Document objects.

    Parameters:
    - documents (List): Document objects to split.

    Returns:
    - List[langchain.schema.document.Document]: The resulting chunks.

    Note:
    - Chunk size / overlap / length function are fixed at 1000 / 50 / len;
      adjust here if different granularity is needed.
    """
    print("Splitting documents into chunks...")
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=50,
        length_function=len,
    )
    pieces = splitter.transform_documents(documents)
    print("Done splitting documents.")
    return pieces
81
+
82
def get_document_embeddings(chunks: List) -> langchain.embeddings.cache.CacheBackedEmbeddings:
    """
    Build a cache-backed OpenAI embedder.

    Wraps an OpenAIEmbeddings model in CacheBackedEmbeddings so repeated
    embedding requests are served from a local on-disk cache ("./cache/"),
    namespaced by the underlying model name.

    Parameters:
    - chunks (List): Document chunks intended for embedding.
      NOTE(review): this argument is accepted but never used in the body —
      embeddings are actually computed later by the vector store.

    Returns:
    - langchain.embeddings.cache.CacheBackedEmbeddings: The cached embedder.
    """
    print("Creating embedder...")
    cache_store = LocalFileStore("./cache/")
    base_model = OpenAIEmbeddings()
    cached_embedder = CacheBackedEmbeddings.from_bytes_store(
        base_model,
        cache_store,
        namespace=base_model.model,
    )
    print("Done creating embedder")
    return cached_embedder
106
+
107
def create_vector_store(chunks: List[langchain.schema.document.Document],
                        embedder: langchain.embeddings.cache.CacheBackedEmbeddings) -> langchain.vectorstores.faiss.FAISS:
    """
    Build a FAISS vector store from document chunks.

    The embedder converts each chunk to a vector; the vectors are indexed
    in an in-memory FAISS store.

    Parameters:
    - chunks (List[langchain.schema.document.Document]): Chunks to index.
    - embedder (langchain.embeddings.cache.CacheBackedEmbeddings): Embedder
      used to vectorize the chunks.

    Returns:
    - langchain.vectorstores.faiss.FAISS: The populated vector store.
    """
    print("Creating vectorstore...")
    store = FAISS.from_documents(chunks, embedder)
    return store
126
+
127
def create_retriever(vectorstore: langchain.vectorstores) -> langchain.vectorstores.base.VectorStoreRetriever:
    """
    Wrap a FAISS vector store in its default retriever interface.

    The retriever allows similarity-based document lookup over the store.

    Parameters:
    - vectorstore (langchain.vectorstores): A populated FAISS vector store.

    Returns:
    - langchain.vectorstores.base.VectorStoreRetriever: The retriever.
    """
    print("Creating vectorstore retriever...")
    return vectorstore.as_retriever()
145
+
146
def embed_user_query(query: str) -> List[float]:
    """
    Embed a user query with the OpenAIEmbeddings model.

    Parameters:
    - query (str): The user query to embed.

    Returns:
    - List[float]: The query's embedding vector.
    """
    # A fresh model instance is created per call, matching the original code.
    model = OpenAIEmbeddings()
    return model.embed_query(query)
162
+
163
def similarity_search(vectorstore: langchain.vectorstores,
                      embedded_query: List[float]) -> List[langchain.schema.document.Document]:
    """
    Find the documents most similar to an embedded query.

    Runs a vector similarity search against the FAISS store using the
    pre-computed query embedding.

    Parameters:
    - vectorstore (langchain.vectorstores): FAISS store of chunk vectors.
    - embedded_query (List[float]): Embedding vector of the user query.

    Returns:
    - List[langchain.schema.document.Document]: The closest documents.

    Note:
    - k=4 nearest neighbours are returned; change 'k' for more or fewer.
    """
    top_k = 4
    return vectorstore.similarity_search_by_vector(embedded_query, k=top_k)
185
+
186
def create_chatbot(retriever: langchain.vectorstores) -> langchain.chains.conversational_retrieval:
    """
    Build a conversational retrieval chatbot around the given retriever.

    Combines a ChatOpenAI model ("gpt-3.5-turbo") with a conversation
    buffer memory and the retriever into a ConversationalRetrievalChain.

    Parameters:
    - retriever (langchain.vectorstores): Retriever used to fetch relevant
      documents for each turn.

    Returns:
    - langchain.chains.conversational_retrieval: The chatbot chain.

    Note:
    - Turn history is kept under the 'chat_history' memory key and feeds
      context into subsequent interactions.
    """
    chat_model = ChatOpenAI(model="gpt-3.5-turbo")

    history = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )

    return ConversationalRetrievalChain.from_llm(
        llm=chat_model,
        retriever=retriever,
        memory=history,
    )
219
+
220
def chat(conversation_chain: langchain.chains.conversational_retrieval, input: str) -> str:
    """
    Send one user message through the chatbot chain and return its reply.

    Parameters:
    - conversation_chain: The ConversationalRetrievalChain to query.
    - input (str): The user's message.
      NOTE(review): the parameter name shadows the builtin `input`; kept
      unchanged for caller compatibility.

    Returns:
    - str: The chatbot's response.
    """
    reply = conversation_chain.run(input)
    return reply
235
+
236
+
237
+
238
# This chatbot_instance will be initialized once a URL is provided.
chatbot_instance = None

def respond(message, chat_history):
    """
    Gradio submit handler: initialize the chatbot from URLs, or chat.

    If the global chatbot is not yet built and the message contains URLs,
    the full pipeline (load -> split -> embed -> index -> retriever ->
    chain) is run to create it. Otherwise the message is forwarded to the
    existing chatbot, or the user is asked for a URL.

    Parameters:
    - message: The user's textbox input.
    - chat_history: Running list of (user, bot) message pairs.

    Returns:
    - tuple: ("", updated chat_history) — the empty string clears the textbox.
    """
    global chatbot_instance
    urls = find_urls(message)
    # If the chatbot is not yet initialized and we have URLs, initialize it
    if not chatbot_instance and urls:
        documents = website_loader(urls)
        chunks = split_text(documents)
        embedder = get_document_embeddings(chunks)
        vectorstore = create_vector_store(chunks, embedder)
        retriever = create_retriever(vectorstore)
        chatbot_instance = create_chatbot(retriever)
        bot_message = "Chatbot initialized! How can I help you?"
    elif chatbot_instance:
        # BUG FIX: chat() is declared as chat(conversation_chain, input);
        # the original call passed (message, chatbot_instance) — i.e. the
        # arguments in swapped order, which would crash at runtime.
        bot_message = chat(chatbot_instance, message)
    else:
        bot_message = "Please provide a URL to initialize the chatbot first."

    chat_history.append((message, bot_message))
    return "", chat_history
261
+
262
# Gradio UI: a chat window, a query textbox, and a clear button.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    user_query = gr.Textbox(label="Your Query", placeholder="What would you like to chat about?")
    # ClearButton resets both the textbox and the chat history.
    clear = gr.ClearButton([user_query, chatbot])

    # On submit, respond() consumes the textbox + history and returns
    # ("", updated history) — clearing the textbox and refreshing the chat.
    user_query.submit(respond, [user_query, chatbot], [user_query, chatbot])

demo.launch()