robertselvam committed on
Commit
f29d7c4
1 Parent(s): 50d0018

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +237 -152
app.py CHANGED
@@ -1,202 +1,287 @@
1
- from pydantic import NoneStr
2
- import os
3
- from langchain.chains.question_answering import load_qa_chain
4
- from langchain.document_loaders import UnstructuredFileLoader
5
- from langchain.embeddings.openai import OpenAIEmbeddings
6
- from langchain.llms import OpenAI
7
  from langchain.text_splitter import CharacterTextSplitter
 
8
  from langchain.vectorstores import FAISS
9
- from langchain.vectorstores import Chroma
 
 
10
  from langchain.chains import ConversationalRetrievalChain
 
 
11
  import gradio as gr
12
- import openai
13
- from langchain import PromptTemplate, OpenAI, LLMChain
14
  import validators
15
  import requests
16
  import mimetypes
17
  import tempfile
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
- class Chatbot:
20
- def __init__(self):
21
- openai.api_key = os.getenv("OPENAI_API_KEY")
22
- def get_empty_state(self):
23
 
24
- """ Create empty Knowledge base"""
 
25
 
26
- return {"knowledge_base": None}
 
 
 
 
 
 
 
27
 
28
- def create_knowledge_base(self,docs):
 
29
 
30
- """Create a knowledge base from the given documents.
31
  Args:
32
- docs (List[str]): List of documents.
 
33
  Returns:
34
- FAISS: Knowledge base built from the documents.
35
  """
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
- # Initialize a CharacterTextSplitter to split the documents into chunks
38
- # Each chunk has a maximum length of 500 characters
39
- # There is no overlap between the chunks
40
- text_splitter = CharacterTextSplitter(
41
- separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len
42
- )
43
 
44
- # Split the documents into chunks using the text_splitter
45
- chunks = text_splitter.split_documents(docs)
46
 
47
- # Initialize an OpenAIEmbeddings model to compute embeddings of the chunks
48
- embeddings = OpenAIEmbeddings()
49
 
50
- # Build a knowledge base using Chroma from the chunks and their embeddings
51
- knowledge_base = Chroma.from_documents(chunks, embeddings)
 
 
52
 
53
- # Return the resulting knowledge base
54
- return knowledge_base
55
 
 
 
 
56
 
57
- def upload_file(self,file_paths):
58
- """Upload a file and create a knowledge base from its contents.
59
  Args:
60
- file_paths : The files to uploaded.
 
61
  Returns:
62
- tuple: A tuple containing the file name and the knowledge base.
63
  """
 
64
 
65
- file_paths = [i.name for i in file_paths]
66
- print(file_paths)
67
 
68
 
69
- loaders = [UnstructuredFileLoader(file_obj, strategy="fast") for file_obj in file_paths]
70
 
71
- # Load the contents of the file using the loader
72
- docs = []
73
- for loader in loaders:
74
- docs.extend(loader.load())
75
 
76
- # Create a knowledge base from the loaded documents using the create_knowledge_base() method
77
- knowledge_base = self.create_knowledge_base(docs)
 
 
78
 
 
79
 
80
- # Return a tuple containing the file name and the knowledge base
81
- return file_paths, {"knowledge_base": knowledge_base}
82
 
83
- def add_text(self,history, text):
84
- history = history + [(text, None)]
85
- print("History for Add text : ",history)
86
- return history, gr.update(value="", interactive=False)
87
 
 
 
88
 
 
 
89
 
90
- def upload_multiple_urls(self,urls):
91
- urlss = [url.strip() for url in urls.split(',')]
92
- all_docs = []
93
- file_paths = []
94
- for url in urlss:
95
- if validators.url(url):
96
- headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',}
97
- r = requests.get(url,headers=headers)
98
- if r.status_code != 200:
99
- raise ValueError("Check the url of your file; returned status code %s" % r.status_code)
100
- content_type = r.headers.get("content-type")
101
- file_extension = mimetypes.guess_extension(content_type)
102
- temp_file = tempfile.NamedTemporaryFile(suffix=file_extension, delete=False)
103
- temp_file.write(r.content)
104
- file_path = temp_file.name
105
- file_paths.append(file_path)
 
 
 
 
 
 
 
 
 
106
 
107
- loaders = [UnstructuredFileLoader(file_obj, strategy="fast") for file_obj in file_paths]
 
108
 
109
- # Load the contents of the file using the loader
110
- docs = []
111
- for loader in loaders:
112
- docs.extend(loader.load())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
 
114
- # Create a knowledge base from the loaded documents using the create_knowledge_base() method
115
- knowledge_base = self.create_knowledge_base(docs)
 
 
 
 
 
116
 
117
- return file_paths,{"knowledge_base":knowledge_base}
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
- def answer_question(self, question,history,state):
120
- """Answer a question based on the current knowledge base.
121
  Args:
122
- state (dict): The current state containing the knowledge base.
 
 
 
123
  Returns:
124
- str: The answer to the question.
125
  """
126
-
127
- # Retrieve the knowledge base from the state dictionary
128
- knowledge_base = state["knowledge_base"]
129
- retriever = knowledge_base.as_retriever()
130
- qa = ConversationalRetrievalChain.from_llm(
131
- llm=OpenAI(temperature=0.1),
132
- retriever=retriever,
133
- return_source_documents=False)
134
- # Set the question for which we want to find the answer
135
- res = []
136
- question = history[-1][0]
137
- for human, ai in history[:-1]:
138
- pair = (human, ai)
139
- res.append(pair)
140
-
141
- chat_history = []
142
-
143
- query = question
144
- result = qa({"question": query, "chat_history": chat_history})
145
- # Perform a similarity search on the knowledge base to retrieve relevant documents
146
- response = result["answer"]
147
- # Return the response as the answer to the question
148
- history[-1][1] = response
149
- print("History for QA : ",history)
150
- return history
151
-
152
-
153
- def clear_function(self,state):
154
- state.clear()
155
- # state = gr.State(self.get_empty_state())
156
-
157
- def gradio_interface(self):
158
-
159
- """Create the Gradio interface for the Chemical Identifier."""
160
-
161
  with gr.Blocks(css="style.css",theme='karthikeyan-adople/hudsonhayes-gray') as demo:
162
- gr.HTML("""<center class="darkblue" style='background-color:rgb(0,1,36); text-align:center;padding:25px;'>
163
- <center>
164
  <h1 class ="center">
165
  <img src="file=logo.png" height="110px" width="280px">
166
  </h1>
167
- </center>
168
- <be>
169
- <h1 style="color:#fff">
170
  Virtual Assistant Chatbot
171
- </h1>
172
- </center>""")
173
- state = gr.State(self.get_empty_state())
174
- with gr.Column(elem_id="col-container"):
175
- with gr.Accordion("Upload Files", open = False):
176
- with gr.Row(elem_id="row-flex"):
177
- with gr.Row(elem_id="row-flex"):
178
- with gr.Column(scale=1,):
179
- file_url = gr.Textbox(label='file url :',show_label=True, placeholder="")
180
- with gr.Row(elem_id="row-flex"):
181
- with gr.Column(scale=1):
182
- file_output = gr.File()
183
- with gr.Column(scale=1):
184
- upload_button = gr.UploadButton("Browse File", file_types=[".txt", ".pdf", ".doc", ".docx"],file_count = "multiple")
185
- with gr.Row():
186
- chatbot = gr.Chatbot([], elem_id="chatbot")
187
- with gr.Row():
188
- txt = gr.Textbox(label = "Question",show_label=True,placeholder="Enter text and press Enter")
189
- with gr.Row():
190
- clear_btn = gr.Button(value="Clear")
191
-
192
- txt_msg = txt.submit(self.add_text, [chatbot, txt], [chatbot, txt], queue=False).then(self.answer_question, [txt, chatbot, state], chatbot)
193
- txt_msg.then(lambda: gr.update(interactive=True), None, [txt], queue=False)
194
- file_url.submit(self.upload_multiple_urls, file_url, [file_output, state])
195
- clear_btn.click(self.clear_function,[state],[])
196
- clear_btn.click(lambda: None, None, chatbot, queue=False)
197
- upload_button.upload(self.upload_file, upload_button, [file_output,state])
198
- demo.queue().launch(debug=True)
199
-
200
- if __name__=="__main__":
201
- chatbot = Chatbot()
202
- chatbot.gradio_interface()
 
 
 
 
 
 
 
1
  from langchain.text_splitter import CharacterTextSplitter
2
+ from langchain.embeddings import OpenAIEmbeddings
3
  from langchain.vectorstores import FAISS
4
+ from langchain.chat_models import ChatOpenAI
5
+ from langchain.memory import ConversationBufferMemory
6
+ from langchain.chains import ConversationChain
7
  from langchain.chains import ConversationalRetrievalChain
8
+ from langchain.document_loaders import UnstructuredFileLoader
9
+ from typing import List, Dict, Tuple
10
  import gradio as gr
 
 
11
  import validators
12
  import requests
13
  import mimetypes
14
  import tempfile
15
+ import os
16
+ from langchain.chains.question_answering import load_qa_chain
17
+ from langchain.llms import OpenAI
18
+ from langchain.prompts import PromptTemplate
19
+ from langchain.prompts.prompt import PromptTemplate
20
+ import pandas as pd
21
+ from langchain.agents import create_pandas_dataframe_agent
22
+ from langchain.agents import ZeroShotAgent, Tool, AgentExecutor
23
+ from langchain import OpenAI, LLMChain
24
+
25
class ChatDocumentQA:
    """Gradio chatbot that answers questions over uploaded documents, URLs,
    or CSV files.

    Documents and URL contents are chunked, embedded with OpenAI embeddings
    and indexed in FAISS for conversational retrieval QA; CSV files are
    answered through a pandas-dataframe agent.  The per-session Gradio state
    dict carries the active knowledge base under the key ``"knowledge_base"``.
    """

    def __init__(self) -> None:
        pass

    def _get_empty_state(self) -> Dict[str, None]:
        """Return the initial session state with no knowledge base."""
        return {"knowledge_base": None}

    def _extract_text_from_pdfs(self, file_paths: List[str]) -> List:
        """Load documents from files on disk.

        Despite the name, this handles any type UnstructuredFileLoader
        supports (.pdf, .txt, .doc, .docx, ...); the name is kept for
        backward compatibility.

        Args:
            file_paths (List[str]): Paths of the files to load.

        Returns:
            List: LangChain ``Document`` objects extracted from the files.
        """
        docs = []
        # strategy="fast" skips the slower OCR/layout analysis path.
        loaders = [UnstructuredFileLoader(path, strategy="fast") for path in file_paths]
        for loader in loaders:
            docs.extend(loader.load())
        return docs

    def _get_content_from_url(self, urls: str) -> List:
        """Download the given URLs to temp files and load them as documents.

        Args:
            urls (str): Comma-separated URLs.

        Returns:
            List: Documents loaded from the downloaded files.

        Raises:
            ValueError: If a URL does not respond with HTTP 200.
        """
        file_paths = []
        for url in urls.split(','):
            # Tolerate whitespace around commas ("a.pdf, b.pdf").
            url = url.strip()
            if validators.url(url):
                headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',}
                r = requests.get(url, headers=headers)
                if r.status_code != 200:
                    raise ValueError("Check the url of your file; returned status code %s" % r.status_code)
                # Pick a file suffix from the response content-type so the
                # loader can detect the format.
                content_type = r.headers.get("content-type")
                file_extension = mimetypes.guess_extension(content_type)
                temp_file = tempfile.NamedTemporaryFile(suffix=file_extension, delete=False)
                temp_file.write(r.content)
                # Close so the content is flushed to disk before the loader
                # reopens the file (previously left open with buffered data).
                temp_file.close()
                file_paths.append(temp_file.name)

        return self._extract_text_from_pdfs(file_paths)

    def _split_text_into_chunks(self, text: List) -> List:
        """Split loaded documents into overlapping chunks.

        Args:
            text (List): Documents to split.  NOTE: despite the parameter
                name this is a list of ``Document`` objects, not a raw
                string — ``split_documents`` is used below.

        Returns:
            List: Chunked documents (1000 chars per chunk, 200 overlap).
        """
        text_splitter = CharacterTextSplitter(
            separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len
        )
        return text_splitter.split_documents(text)

    def _create_vector_store_from_text_chunks(self, text_chunks: List) -> "FAISS":
        """Embed the chunks with OpenAI embeddings and index them in FAISS.

        Args:
            text_chunks (List): Chunked documents.

        Returns:
            FAISS: Vector store built from the chunks.
        """
        embeddings = OpenAIEmbeddings()
        return FAISS.from_documents(documents=text_chunks, embedding=embeddings)

    def _create_conversation_chain(self, vectorstore):
        """Build a ConversationalRetrievalChain over the given vector store.

        The condense-question prompt rewrites a follow-up into a standalone
        question; a buffer memory keyed on ``chat_history`` tracks the
        conversation.
        """
        _template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History: {chat_history}
Follow Up Input: {question}
Standalone question:"""
        CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

        memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
        llm = ChatOpenAI(temperature=0)

        return ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=vectorstore.as_retriever(),
            condense_question_prompt=CONDENSE_QUESTION_PROMPT,
            memory=memory,
        )

    def _get_documents_knowledge_base(self, file_paths) -> Tuple[str, Dict]:
        """Build a knowledge base from uploaded files.

        Args:
            file_paths: Files received from ``gr.UploadButton`` (objects
                exposing a ``.name`` path attribute).

        Returns:
            Tuple[str, Dict]: Status message and the session state holding
            either a FAISS store (documents) or an agent chain (CSV).
        """
        first_path = file_paths[0].name
        file_extension = os.path.splitext(first_path)[1]

        if file_extension == '.csv':
            # CSV questions are answered by a pandas dataframe agent.
            df = pd.read_csv(first_path)
            pd_agent = create_pandas_dataframe_agent(OpenAI(temperature=0), df, verbose=True)
            tools = self.get_agent_tools(pd_agent)
            memory, tools, prompt = self.create_memory_for_csv_qa(tools)
            agent_chain = self.create_agent_chain_for_csv_qa(memory, tools, prompt)
            return "file uploaded", {"knowledge_base": agent_chain}

        # Every other allowed type (.pdf/.txt/.doc/.docx) goes through the
        # unstructured-loader -> chunk -> FAISS pipeline.  The previous
        # version only did this for .pdf and silently dropped the rest,
        # leaving the state without a usable knowledge base.
        doc_paths = [f.name for f in file_paths]
        raw_docs = self._extract_text_from_pdfs(doc_paths)
        text_chunks = self._split_text_into_chunks(raw_docs)
        vectorstore = self._create_vector_store_from_text_chunks(text_chunks)
        return "file uploaded", {"knowledge_base": vectorstore}

    def _get_urls_knowledge_base(self, urls: str) -> Tuple[str, Dict]:
        """Build a knowledge base from URLs.

        Args:
            urls (str): Comma-separated URLs.

        Returns:
            Tuple[str, Dict]: Status message and state holding the FAISS store.
        """
        webpage_text = self._get_content_from_url(urls)
        text_chunks = self._split_text_into_chunks(webpage_text)
        vectorstore = self._create_vector_store_from_text_chunks(text_chunks)
        return "file uploaded", {"knowledge_base": vectorstore}

    # ************************
    # csv qa
    # ************************
    def get_agent_tools(self, agent):
        """Wrap the pandas-dataframe agent as a single LangChain Tool."""
        tools = [
            Tool(
                name="dataframe qa",
                func=agent.run,
                description="useful for when you need to answer questions about table data and dataframe data",
            )
        ]
        return tools

    def create_memory_for_csv_qa(self, tools):
        """Create the ZeroShotAgent prompt and conversation memory for CSV QA.

        Returns:
            Tuple: (memory, tools, prompt) for building the agent chain.
        """
        prefix = """Have a conversation with a human, answering the following questions about table data and dataframe data as best you can. You have access to the following tools:"""
        suffix = """Begin!"

{chat_history}
Question: {input}
{agent_scratchpad}"""

        prompt = ZeroShotAgent.create_prompt(
            tools,
            prefix=prefix,
            suffix=suffix,
            input_variables=["input", "chat_history", "agent_scratchpad"],
        )
        memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

        return memory, tools, prompt

    def create_agent_chain_for_csv_qa(self, memory, tools, prompt):
        """Assemble the ZeroShotAgent executor used to answer CSV questions."""
        llm_chain = LLMChain(llm=OpenAI(temperature=0), prompt=prompt)
        agent = ZeroShotAgent(llm_chain=llm_chain, tools=tools, verbose=True)
        agent_chain = AgentExecutor.from_agent_and_tools(
            agent=agent, tools=tools, verbose=True, memory=memory
        )
        return agent_chain

    def _get_response(self, message: str, chat_history: List[Tuple[str, str]],
                      state: Dict, file_paths) -> Tuple[str, List[Tuple[str, str]]]:
        """Answer the user's message from the active knowledge base.

        Args:
            message (str): User's message/question.
            chat_history (List[Tuple[str, str]]): (user, bot) message pairs.
            state (dict): Session state containing the knowledge base.
            file_paths: Current value of the upload button (may be None when
                only URLs were provided).

        Returns:
            Tuple[str, List[Tuple[str, str]]]: Empty textbox value and the
            updated chat history.
        """
        try:
            if file_paths:
                file_extension = os.path.splitext(file_paths[0].name)[1]
                if file_extension == '.csv':
                    # CSV: route through the dataframe agent chain.
                    agent_chain = state["knowledge_base"]
                    response = agent_chain.run(input=message)
                    chat_history.append((message, response))
                    return "", chat_history

            # Default path: retrieval QA over the FAISS store.  This also
            # covers the URL-only flow (no uploaded file), which previously
            # fell through the `if file_paths:` guard and returned None,
            # crashing the Gradio callback.
            vectorstore = state["knowledge_base"]
            chat = self._create_conversation_chain(vectorstore)
            result = chat({"question": message, "chat_history": chat_history})
            chat_history.append((message, result["answer"]))
            return "", chat_history
        except Exception:
            # Best effort: no knowledge base yet (or the chain failed) —
            # keep the UI responsive and prompt the user.  Was a bare
            # `except:`, which also swallowed KeyboardInterrupt/SystemExit.
            chat_history.append((message, "Please Upload Document or URL"))
            return "", chat_history

    def gradio_interface(self) -> None:
        """Wire up the Gradio Blocks UI and launch it (blocking)."""
        with gr.Blocks(css="style.css", theme='karthikeyan-adople/hudsonhayes-gray') as demo:
            # NOTE: `<be>` in the previous revision was a typo for `<br>`.
            gr.HTML("""<center class="darkblue" style='background-color:rgb(0,1,36); text-align:center;padding:25px;'>
                    <center>
                    <h1 class ="center">
                    <img src="file=logo.png" height="110px" width="280px">
                    </h1>
                    </center>
                    <br>
                    <h1 style="color:#fff">
                    Virtual Assistant Chatbot
                    </h1>
                    </center>""")
            state = gr.State(self._get_empty_state())
            chatbot = gr.Chatbot()
            with gr.Row():
                with gr.Column(scale=0.85):
                    msg = gr.Textbox(label="Question")
                with gr.Column(scale=0.15):
                    file_output = gr.Textbox(label="File Status")
            with gr.Row():
                with gr.Column(scale=0.85):
                    clear = gr.ClearButton([msg, chatbot])
                with gr.Column(scale=0.15):
                    # ".csv" added: the CSV branch of
                    # _get_documents_knowledge_base was unreachable without it.
                    upload_button = gr.UploadButton(
                        "Browse File",
                        file_types=[".txt", ".pdf", ".doc", ".docx", ".csv"],
                        file_count="multiple", variant="primary"
                    )
            with gr.Row():
                with gr.Column(scale=1):
                    input_url = gr.Textbox(label="urls")

            input_url.submit(self._get_urls_knowledge_base, input_url, [file_output, state])
            upload_button.upload(self._get_documents_knowledge_base, upload_button, [file_output, state])
            msg.submit(self._get_response, [msg, chatbot, state, upload_button], [msg, chatbot])

        demo.launch()
284
+
285
if __name__ == "__main__":
    # Entry point: build the chatbot and start the Gradio UI.
    ChatDocumentQA().gradio_interface()