Runtime error
Runtime error
Browse files
@@ -1,202 +1,287 @@
1 |
from pydantic import NoneStr
2 |
import os
3 |
from langchain.chains.question_answering import load_qa_chain
4 |
from langchain.document_loaders import UnstructuredFileLoader
5 |
from langchain.embeddings.openai import OpenAIEmbeddings
6 |
from langchain.llms import OpenAI
7 |
from langchain.text_splitter import CharacterTextSplitter
8 |
from langchain.vectorstores import FAISS
9 |
from langchain.
10 |
from langchain.chains import ConversationalRetrievalChain
11 |
import gradio as gr
12 |
import openai
13 |
from langchain import PromptTemplate, OpenAI, LLMChain
14 |
import validators
15 |
import requests
16 |
import mimetypes
17 |
import tempfile
18 |
19 |
20 |
21 |
openai.api_key = os.getenv("OPENAI_API_KEY")
22 |
def get_empty_state(self):
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
"""Create a knowledge base from the given documents.
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
# Consecutive chunks overlap by up to 200 characters (chunk_overlap=200)
40 |
text_splitter = CharacterTextSplitter(
41 |
separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
return knowledge_base
55 |
56 |
57 |
def upload_file(self,file_paths):
58 |
"""Upload a file and create a knowledge base from its contents.
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
docs = []
73 |
for loader in loaders:
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
return file_paths, {"knowledge_base": knowledge_base}
82 |
83 |
84 |
85 |
86 |
return history, gr.update(value="", interactive=False)
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
def answer_question(self, question,history,state):
120 |
"""Answer a question based on the current knowledge base.
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
def gradio_interface(self):
158 |
159 |
"""Create the Gradio interface for the Chemical Identifier."""
160 |
161 |
with gr.Blocks(css="style.css",theme='karthikeyan-adople/hudsonhayes-gray') as demo:
162 |
163 |
164 |
<h1 class ="center">
165 |
<img src="file=logo.png" height="110px" width="280px">
166 |
167 |
168 |
169 |
170 |
Virtual Assistant Chatbot
171 |
172 |
173 |
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 |
1 |
from langchain.text_splitter import CharacterTextSplitter
2 |
from langchain.embeddings import OpenAIEmbeddings
3 |
from langchain.vectorstores import FAISS
4 |
from langchain.chat_models import ChatOpenAI
5 |
from langchain.memory import ConversationBufferMemory
6 |
from langchain.chains import ConversationChain
7 |
from langchain.chains import ConversationalRetrievalChain
8 |
from langchain.document_loaders import UnstructuredFileLoader
9 |
from typing import List, Dict, Tuple
10 |
import gradio as gr
11 |
import validators
12 |
import requests
13 |
import mimetypes
14 |
import tempfile
15 |
import os
16 |
from langchain.chains.question_answering import load_qa_chain
17 |
from langchain.llms import OpenAI
18 |
from langchain.prompts import PromptTemplate
19 |
from langchain.prompts.prompt import PromptTemplate
20 |
import pandas as pd
21 |
from langchain.agents import create_pandas_dataframe_agent
22 |
from langchain.agents import ZeroShotAgent, Tool, AgentExecutor
23 |
from langchain import OpenAI, LLMChain
24 |
25 |
class ChatDocumentQA:
26 |
def __init__(self) -> None:
27 |
28 |
29 |
def _get_empty_state(self) -> Dict[str, None]:
    """Return the initial session state, which holds no knowledge base yet.

    Returns:
        Dict[str, None]: A new dict whose ``"knowledge_base"`` entry is
        ``None``.
    """
    # A new dict is built on every call, so callers never share one object.
    return {"knowledge_base": None}
32 |
33 |
def _extract_text_from_pdfs(self, file_paths: List[str]) -> List[str]:
34 |
"""Extract text content from PDF files.
35 |
36 |
37 |
file_paths (List[str]): List of file paths.
38 |
39 |
40 |
List[str]: Extracted text from the PDFs.
41 |
42 |
docs = []
43 |
loaders = [UnstructuredFileLoader(file_obj, strategy="fast") for file_obj in file_paths]
44 |
for loader in loaders:
45 |
46 |
return docs
47 |
48 |
def _get_content_from_url(self, urls: str) -> List[str]:
49 |
"""Fetch content from given URLs.
50 |
51 |
52 |
urls (str): Comma-separated URLs.
53 |
54 |
55 |
List[str]: List of text content fetched from the URLs.
56 |
57 |
file_paths = []
58 |
for url in urls.split(','):
59 |
if validators.url(url):
60 |
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/ Safari/537.36',}
61 |
r = requests.get(url, headers=headers)
62 |
if r.status_code != 200:
63 |
raise ValueError("Check the url of your file; returned status code %s" % r.status_code)
64 |
content_type = r.headers.get("content-type")
65 |
file_extension = mimetypes.guess_extension(content_type)
66 |
temp_file = tempfile.NamedTemporaryFile(suffix=file_extension, delete=False)
67 |
68 |
69 |
70 |
docs = self._extract_text_from_pdfs(file_paths)
71 |
return docs
72 |
73 |
def _split_text_into_chunks(self, text: str) -> List[str]:
    """Split loaded content into smaller, overlapping chunks.

    Args:
        text: Content to split. NOTE(review): annotated as ``str``, but the
            value is handed to ``split_documents`` and the callers in this
            class pass the result of document loading -- confirm the real
            type and tighten the annotation.

    Returns:
        The chunks produced by ``CharacterTextSplitter.split_documents``.
    """
    # Split on newlines; sizes are measured in characters via ``len``, with
    # chunk_size=1000 and chunk_overlap=200.
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_documents(text)
87 |
def _create_vector_store_from_text_chunks(self, text_chunks: List[str]) -> FAISS:
    """Embed the chunks with OpenAI embeddings and index them in FAISS.

    Args:
        text_chunks: Chunks to index; forwarded unchanged as the
            ``documents`` argument of ``FAISS.from_documents``.

    Returns:
        FAISS: Vector store built from ``text_chunks``.
    """
    embeddings = OpenAIEmbeddings()
    return FAISS.from_documents(documents=text_chunks, embedding=embeddings)
99 |
100 |
101 |
def _create_conversation_chain(self,vectorstore):
102 |
103 |
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.
104 |
105 |
Chat History: {chat_history}
106 |
Follow Up Input: {question}
107 |
Standalone question:"""
108 |
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)
109 |
110 |
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
111 |
112 |
llm = ChatOpenAI(temperature=0)
113 |
114 |
return ConversationalRetrievalChain.from_llm(llm=llm, retriever=vectorstore.as_retriever(),
115 |
116 |
117 |
118 |
def _get_documents_knowledge_base(self, file_paths: List[str]) -> Tuple[str, Dict[str, FAISS]]:
119 |
"""Build knowledge base from uploaded files.
120 |
121 |
122 |
file_paths (List[str]): List of file paths.
123 |
124 |
125 |
Tuple[str, Dict]: Tuple containing a status message and the knowledge base.
126 |
127 |
file_path = file_paths[0].name
128 |
file_extension = os.path.splitext(file_path)[1]
129 |
130 |
if file_extension == '.pdf':
131 |
pdf_docs = [ for file_path in file_paths]
132 |
raw_text = self._extract_text_from_pdfs(pdf_docs)
133 |
text_chunks = self._split_text_into_chunks(raw_text)
134 |
vectorstore = self._create_vector_store_from_text_chunks(text_chunks)
135 |
return "file uploaded", {"knowledge_base": vectorstore}
136 |
elif file_extension == '.csv':
137 |
df = pd.read_csv(file_path)
138 |
pd_agent = create_pandas_dataframe_agent(OpenAI(temperature=0), df, verbose=True)
139 |
tools = self.get_agent_tools(pd_agent)
140 |
memory,tools,prompt = self.create_memory_for_csv_qa(tools)
141 |
agent_chain = self.create_agent_chain_for_csv_qa(memory,tools,prompt)
142 |
return "file uploaded", {"knowledge_base": agent_chain}
143 |
144 |
145 |
return "file uploaded", ""
146 |
147 |
def _get_urls_knowledge_base(self, urls: str) -> Tuple[str, Dict[str, FAISS]]:
    """Build a knowledge base from the content behind the given URLs.

    Runs the same three steps as the file-upload path: fetch the content,
    split it into chunks, and index the chunks in a vector store.

    Args:
        urls (str): Comma-separated URLs, passed through to
            ``_get_content_from_url``.

    Returns:
        Tuple[str, Dict]: The status message ``"file uploaded"`` and a state
        dict holding the vector store under ``"knowledge_base"``.
    """
    fetched_docs = self._get_content_from_url(urls)
    chunks = self._split_text_into_chunks(fetched_docs)
    store = self._create_vector_store_from_text_chunks(chunks)
    return "file uploaded", {"knowledge_base": store}
160 |
161 |
162 |
# csv qa
163 |
164 |
def get_agent_tools(self,agent):
165 |
# search = agent
166 |
tools = [
167 |
168 |
name="dataframe qa",
169 |
170 |
description="useful for when you need to answer questions about table data and dataframe data",
171 |
172 |
173 |
return tools
174 |
175 |
def create_memory_for_csv_qa(self,tools):
176 |
prefix = """Have a conversation with a human, answering the following questions about table data and dataframe data as best you can. You have access to the following tools:"""
177 |
suffix = """Begin!"
178 |
179 |
180 |
Question: {input}
181 |
182 |
183 |
prompt = ZeroShotAgent.create_prompt(
184 |
185 |
186 |
187 |
input_variables=["input", "chat_history", "agent_scratchpad"],
188 |
189 |
memory = ConversationBufferMemory(memory_key="chat_history",return_messages=True)
190 |
191 |
return memory,tools,prompt
192 |
193 |
def create_agent_chain_for_csv_qa(self, memory, tools, prompt):
    """Assemble the zero-shot agent executor used for CSV question answering.

    Args:
        memory: Memory object handed to the executor.
        tools: Tools given to both the agent and the executor.
        prompt: Prompt used by the agent's LLM chain.

    Returns:
        The executor built by ``AgentExecutor.from_agent_and_tools``.
    """
    llm_chain = LLMChain(llm=OpenAI(temperature=0), prompt=prompt)
    agent = ZeroShotAgent(llm_chain=llm_chain, tools=tools, verbose=True)
    agent_chain = AgentExecutor.from_agent_and_tools(
        agent=agent, tools=tools, verbose=True, memory=memory
    )
    return agent_chain
202 |
203 |
def _get_response(self, message: str, chat_history: List[Tuple[str, str]], state: Dict[str, FAISS],file_paths) -> Tuple[str, List[Tuple[str, str]]]:
204 |
"""Get a response from the chatbot.
205 |
206 |
207 |
message (str): User's message/question.
208 |
chat_history (List[Tuple[str, str]]): List of chat history as tuples of (user_message, bot_response).
209 |
state (dict): State containing the knowledge base.
210 |
211 |
212 |
Tuple[str, List[Tuple[str, str]]]: Tuple containing a status message and updated chat history.
213 |
214 |
215 |
if file_paths:
216 |
file_path = file_paths[0].name
217 |
file_extension = os.path.splitext(file_path)[1]
218 |
219 |
if file_extension == ".pdf":
220 |
vectorstore = state["knowledge_base"]
221 |
chat = self._create_conversation_chain(vectorstore)
222 |
# user_ques = {"question": message}
223 |
224 |
response = chat({"question": message,"chat_history": chat_history})
225 |
chat_history.append((message, response["answer"]))
226 |
return "", chat_history
227 |
228 |
elif file_extension == '.csv':
229 |
agent_chain = state["knowledge_base"]
230 |
response = = message)
231 |
chat_history.append((message, response))
232 |
return "", chat_history
233 |
234 |
vectorstore = state["knowledge_base"]
235 |
chat = self._create_conversation_chain(vectorstore)
236 |
# user_ques = {"question": message}
237 |
238 |
response = chat({"question": message,"chat_history": chat_history})
239 |
chat_history.append((message, response["answer"]))
240 |
return "", chat_history
241 |
242 |
chat_history.append((message, "Please Upload Document or URL"))
243 |
return "", chat_history
244 |
245 |
def gradio_interface(self) -> None:
246 |
"""Create a Gradio interface for the chatbot."""
247 |
with gr.Blocks(css="style.css",theme='karthikeyan-adople/hudsonhayes-gray') as demo:
248 |
gr.HTML("""<center class="darkblue" style='background-color:rgb(0,1,36); text-align:center;padding:25px;'>
249 |
250 |
<h1 class ="center">
251 |
<img src="file=logo.png" height="110px" width="280px">
252 |
253 |
254 |
255 |
<h1 style="color:#fff">
256 |
Virtual Assistant Chatbot
257 |
258 |
259 |
state = gr.State(self._get_empty_state())
260 |
chatbot = gr.Chatbot()
261 |
with gr.Row():
262 |
with gr.Column(scale=0.85):
263 |
msg = gr.Textbox(label="Question")
264 |
with gr.Column(scale=0.15):
265 |
file_output = gr.Textbox(label="File Status")
266 |
with gr.Row():
267 |
with gr.Column(scale=0.85):
268 |
clear = gr.ClearButton([msg, chatbot])
269 |
with gr.Column(scale=0.15):
270 |
upload_button = gr.UploadButton(
271 |
"Browse File",
272 |
file_types=[".txt", ".pdf", ".doc", ".docx"],
273 |
file_count="multiple", variant="primary"
274 |
275 |
with gr.Row():
276 |
with gr.Column(scale=1):
277 |
input_url = gr.Textbox(label="urls")
278 |
279 |
input_url.submit(self._get_urls_knowledge_base, input_url, [file_output, state])
280 |
upload_button.upload(self._get_documents_knowledge_base, upload_button, [file_output, state])
281 |
msg.submit(self._get_response, [msg, chatbot, state,upload_button], [msg, chatbot])
282 |
283 |
284 |
285 |
if __name__ == "__main__":
286 |
chatdocumentqa = ChatDocumentQA()
287 |