Karthikeyan committed on
Commit
95ce447
·
1 Parent(s): 3151c83

Create app.py

Files changed (1)
  1. app.py +245 -0
app.py ADDED
@@ -0,0 +1,245 @@
+from pydantic import NoneStr
+import os
+import pandas as pd  # needed by extract_excel_data()
+from langchain.chains.question_answering import load_qa_chain
+from langchain.document_loaders import UnstructuredFileLoader
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.llms import OpenAI
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.vectorstores import FAISS
+from langchain.vectorstores import Chroma
+from langchain.chains import ConversationalRetrievalChain
+import gradio as gr
+import openai
+from langchain import PromptTemplate, OpenAI, LLMChain
+import validators
+import requests
+import mimetypes
+import tempfile
+
+class Chatbot:
+
+    def __init__(self):
+        openai.api_key = os.getenv("OPENAI_API_KEY")
+
+    def get_empty_state(self):
+        """Create an empty knowledge base."""
+        return {"knowledge_base": None}
+
+    def create_knowledge_base(self, docs):
+        """Create a knowledge base from the given documents.
+        Args:
+            docs (List[Document]): List of loaded documents.
+        Returns:
+            Chroma: Knowledge base (vector store) built from the documents.
+        """
+        # Initialize a CharacterTextSplitter to split the documents into chunks
+        # Each chunk has a maximum length of 1000 characters
+        # Consecutive chunks overlap by 200 characters
+        text_splitter = CharacterTextSplitter(
+            separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len
+        )
+
+        # Split the documents into chunks using the text_splitter
+        chunks = text_splitter.split_documents(docs)
+
+        # Initialize an OpenAIEmbeddings model to compute embeddings of the chunks
+        embeddings = OpenAIEmbeddings()
+
+        # Build a Chroma vector store from the chunks and their embeddings
+        knowledge_base = Chroma.from_documents(chunks, embeddings)
+
+        # Return the resulting knowledge base
+        return knowledge_base
+
+    def upload_file(self, file_paths):
+        """Upload files and create a knowledge base from their contents.
+        Args:
+            file_paths: The uploaded files.
+        Returns:
+            tuple: A tuple containing the file paths and the knowledge base.
+        """
+        file_paths = [i.name for i in file_paths]
+        print(file_paths)
+
+        loaders = [UnstructuredFileLoader(file_obj, strategy="fast") for file_obj in file_paths]
+
+        # Load the contents of each file using its loader
+        docs = []
+        for loader in loaders:
+            docs.extend(loader.load())
+
+        # Create a knowledge base from the loaded documents using the create_knowledge_base() method
+        knowledge_base = self.create_knowledge_base(docs)
+
+        # Return a tuple containing the file paths and the knowledge base
+        return file_paths, {"knowledge_base": knowledge_base}
+
+    def add_text(self, history, text):
+        """Append the user's message to the chat history and clear the textbox."""
+        history = history + [(text, None)]
+        return history, gr.update(value="", interactive=False)
+
+    def upload_multiple_urls(self, urls):
+        """Download one or more comma-separated URLs and build a knowledge base from their contents."""
+        url_list = [url.strip() for url in urls.split(',')]
+        file_paths = []
+        for url in url_list:
+            if validators.url(url):
+                headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}
+                r = requests.get(url, headers=headers)
+                if r.status_code != 200:
+                    raise ValueError(
+                        "Check the url of your file; returned status code %s" % r.status_code
+                    )
+                # Guess a file extension from the response content type and save the body to a temporary file
+                content_type = r.headers.get("content-type")
+                file_extension = mimetypes.guess_extension(content_type)
+                temp_file = tempfile.NamedTemporaryFile(suffix=file_extension, delete=False)
+                temp_file.write(r.content)
+                file_path = temp_file.name
+                file_paths.append(file_path)
+
+        loaders = [UnstructuredFileLoader(file_obj, strategy="fast") for file_obj in file_paths]
+
+        # Load the contents of each downloaded file using its loader
+        docs = []
+        for loader in loaders:
+            docs.extend(loader.load())
+
+        # Create a knowledge base from the loaded documents using the create_knowledge_base() method
+        knowledge_base = self.create_knowledge_base(docs)
+
+        return file_paths, {"knowledge_base": knowledge_base}
+
+    def answer_question(self, question, history, state):
+        """Answer the latest question in the chat history using the current knowledge base.
+        Args:
+            question (str): The text typed by the user.
+            history (list): Chat history as (question, answer) pairs.
+            state (dict): The current state containing the knowledge base.
+        Returns:
+            list: The chat history with the answer filled in for the last question.
+        """
+        # Retrieve the knowledge base from the state dictionary
+        knowledge_base = state["knowledge_base"]
+        retriever = knowledge_base.as_retriever()
+        qa = ConversationalRetrievalChain.from_llm(
+            llm=OpenAI(temperature=0.5),
+            retriever=retriever,
+            return_source_documents=False)
+
+        # The question to answer is the last user turn; the earlier turns form the chat history
+        res = []
+        question = history[-1][0]
+        for human, ai in history[:-1]:
+            pair = (human, ai)
+            res.append(pair)
+
+        chat_history = res
+        # print(chat_history)
+        query = question
+        # Run the chain: it performs a similarity search on the knowledge base and
+        # answers the question from the retrieved documents
+        result = qa({"question": query, "chat_history": chat_history})
+        response = result["answer"]
+        # Store the response as the answer to the last question and return the updated history
+        history[-1][1] = response
+        return history
+
+    def extract_excel_data(self, file_path):
+        """Read an Excel file and flatten its cells into a single list."""
+        # Read the Excel file
+        df = pd.read_excel(file_path)
+
+        # Flatten the data to a single list
+        data_list = []
+        for _, row in df.iterrows():
+            data_list.extend(row.tolist())
+
+        return data_list
+
+    def comparing_chemicals(self, excel_file_path, chemicals):
+        """Return the chemicals that appear both in the given list and in the Excel capability sheet."""
+        chemistry_capability = self.extract_excel_data(excel_file_path.name)
+        response = openai.Completion.create(
+            engine="text-davinci-003",
+            prompt=f"""Analyse the following text delimited by triple backticks to return the common chemicals.
+            text : ```{chemicals} {chemistry_capability}```.
+            result should be in bullet points format.
+            """,
+            max_tokens=100,
+            n=1,
+            stop=None,
+            temperature=0,
+            top_p=1.0,
+            frequency_penalty=0.0,
+            presence_penalty=0.0
+        )
+
+        result = response.choices[0].text.strip()
+        return result
+
+    def clear_function(self, state):
+        """Reset the session state to an empty knowledge base."""
+        state.clear()
+        # state = gr.State(self.get_empty_state())
+
+    def gradio_interface(self):
+        """Create the Gradio interface for the Multi URL and Doc Chatbot Q&A."""
+        with gr.Blocks(css="style.css", theme=gr.themes.Soft()) as demo:
+            state = gr.State(self.get_empty_state())
+            with gr.Column(elem_id="col-container"):
+                gr.HTML(
+                    """<hr style="border-top: 5px solid white;">"""
+                )
+                gr.HTML(
+                    """<br>
+                    <h1 style="text-align:center;">
+                    Multi URL and Doc Chatbot Q&A
+                    </h1> """
+                )
+                gr.HTML(
+                    """<hr style="border-top: 5px solid white;">"""
+                )
+
+                gr.Markdown("**Upload your file**")
+                with gr.Row(elem_id="row-flex"):
+                    with gr.Column(scale=1, min_width=0):
+                        file_url = gr.Textbox(label='file url :', show_label=True, placeholder="")
+                with gr.Row(elem_id="row-flex"):
+                    with gr.Column(scale=0.90, min_width=160):
+                        file_output = gr.File(elem_classes="heightfit")
+                    with gr.Column(scale=0.10, min_width=160):
+                        upload_button = gr.UploadButton(
+                            "Browse File", file_types=[".txt", ".pdf", ".doc", ".docx"],
+                            elem_classes="heightfit",
+                            file_count="multiple")
+                with gr.Row():
+                    chatbot = gr.Chatbot([], elem_id="chatbot")
+                with gr.Row():
+                    txt = gr.Textbox(
+                        label="Question",
+                        show_label=True,
+                        placeholder="Enter your question and press enter",
+                    )
+                with gr.Row():
+                    clear_btn = gr.Button(value="Clear")
+
+                # Wire up the events: submit a question, answer it, then re-enable the textbox
+                txt_msg = txt.submit(self.add_text, [chatbot, txt], [chatbot, txt], queue=False).then(
+                    self.answer_question, [txt, chatbot, state], chatbot
+                )
+                txt_msg.then(lambda: gr.update(interactive=True), None, [txt], queue=False)
+
+                file_url.submit(self.upload_multiple_urls, file_url, [file_output, state])
+                clear_btn.click(self.clear_function, [state], [])
+                clear_btn.click(lambda: None, None, chatbot, queue=False)
+                upload_button.upload(self.upload_file, upload_button, [file_output, state])
+        demo.queue().launch(debug=True)
+
+if __name__ == "__main__":
+    chatbot = Chatbot()
+    chatbot.gradio_interface()