Multi-Doc-Virtual-Chatbot

Runtime error

App Files Files Community

robertselvam committed on Sep 6, 2023

Commit

f29d7c4

1 Parent(s): 50d0018

Update app.py

Browse files

Files changed (1) hide show

app.py +237 -152

app.py CHANGED Viewed

@@ -1,202 +1,287 @@
-from pydantic import NoneStr
-import os
-from langchain.chains.question_answering import load_qa_chain
-from langchain.document_loaders import UnstructuredFileLoader
-from langchain.embeddings.openai import OpenAIEmbeddings
-from langchain.llms import OpenAI
 from langchain.text_splitter import CharacterTextSplitter
 from langchain.vectorstores import FAISS
-from langchain.vectorstores import Chroma
 from langchain.chains import ConversationalRetrievalChain
 import gradio as gr
-import openai
-from langchain import PromptTemplate, OpenAI, LLMChain
 import validators
 import requests
 import mimetypes
 import tempfile
-class Chatbot:
-    def __init__(self):
-        openai.api_key = os.getenv("OPENAI_API_KEY")
-    def get_empty_state(self):
-        """ Create empty Knowledge base"""
-        return {"knowledge_base": None}
-    def create_knowledge_base(self,docs):
-        """Create a knowledge base from the given documents.
         Args:
-            docs (List[str]): List of documents.
         Returns:
-            FAISS: Knowledge base built from the documents.
         """
-        # Initialize a CharacterTextSplitter to split the documents into chunks
-        # Each chunk has a maximum length of 500 characters
-        # There is no overlap between the chunks
-        text_splitter = CharacterTextSplitter(
-            separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len
-        )
-        # Split the documents into chunks using the text_splitter
-        chunks = text_splitter.split_documents(docs)
-        # Initialize an OpenAIEmbeddings model to compute embeddings of the chunks
-        embeddings = OpenAIEmbeddings()
-        # Build a knowledge base using Chroma from the chunks and their embeddings
-        knowledge_base = Chroma.from_documents(chunks, embeddings)
-        # Return the resulting knowledge base
-        return knowledge_base
-    def upload_file(self,file_paths):
-        """Upload a file and create a knowledge base from its contents.
         Args:
-            file_paths : The files to uploaded.
         Returns:
-            tuple: A tuple containing the file name and the knowledge base.
         """
-        file_paths = [i.name for i in file_paths]
-        print(file_paths)
-        loaders = [UnstructuredFileLoader(file_obj, strategy="fast") for file_obj in file_paths]
-        # Load the contents of the file using the loader
-        docs = []
-        for loader in loaders:
-            docs.extend(loader.load())
-        # Create a knowledge base from the loaded documents using the create_knowledge_base() method
-        knowledge_base = self.create_knowledge_base(docs)
-        # Return a tuple containing the file name and the knowledge base
-        return file_paths, {"knowledge_base": knowledge_base}
-    def add_text(self,history, text):
-        history = history + [(text, None)]
-        print("History for Add text : ",history)
-        return history, gr.update(value="", interactive=False)
-    def upload_multiple_urls(self,urls):
-        urlss = [url.strip() for url in urls.split(',')]
-        all_docs = []
-        file_paths = []
-        for url in urlss:
-            if validators.url(url):
-                headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',}
-                r = requests.get(url,headers=headers)
-                if r.status_code != 200:
-                    raise ValueError("Check the url of your file; returned status code %s" % r.status_code)
-                content_type = r.headers.get("content-type")
-                file_extension = mimetypes.guess_extension(content_type)
-                temp_file = tempfile.NamedTemporaryFile(suffix=file_extension, delete=False)
-                temp_file.write(r.content)
-                file_path = temp_file.name
-                file_paths.append(file_path)
-        loaders = [UnstructuredFileLoader(file_obj, strategy="fast") for file_obj in file_paths]
-        # Load the contents of the file using the loader
-        docs = []
-        for loader in loaders:
-            docs.extend(loader.load())
-        # Create a knowledge base from the loaded documents using the create_knowledge_base() method
-        knowledge_base = self.create_knowledge_base(docs)
-        return file_paths,{"knowledge_base":knowledge_base}
-    def answer_question(self, question,history,state):
-        """Answer a question based on the current knowledge base.
         Args:
-            state (dict): The current state containing the knowledge base.
         Returns:
-            str: The answer to the question.
         """
-        # Retrieve the knowledge base from the state dictionary
-        knowledge_base = state["knowledge_base"]
-        retriever = knowledge_base.as_retriever()
-        qa = ConversationalRetrievalChain.from_llm(
-            llm=OpenAI(temperature=0.1),
-            retriever=retriever,
-            return_source_documents=False)
-        # Set the question for which we want to find the answer
-        res = []
-        question = history[-1][0]
-        for human, ai in history[:-1]:
-            pair = (human, ai)
-            res.append(pair)
-        chat_history = []
-        query = question
-        result = qa({"question": query, "chat_history": chat_history})
-        # Perform a similarity search on the knowledge base to retrieve relevant documents
-        response = result["answer"]
-        # Return the response as the answer to the question
-        history[-1][1] = response
-        print("History for QA : ",history)
-        return history
-    def clear_function(self,state):
-      state.clear()
-      # state = gr.State(self.get_empty_state())
-    def gradio_interface(self):
-        """Create the Gradio interface for the Chemical Identifier."""
         with gr.Blocks(css="style.css",theme='karthikeyan-adople/hudsonhayes-gray') as demo:
-          gr.HTML("""<center class="darkblue" style='background-color:rgb(0,1,36); text-align:center;padding:25px;'>
-           <center>
               <h1 class ="center">
                    <img src="file=logo.png" height="110px" width="280px">
               </h1>
-           </center>
-           <be>
-           <h1 style="color:#fff">
                Virtual Assistant Chatbot
-           </h1>
-           </center>""")
-          state = gr.State(self.get_empty_state())
-          with gr.Column(elem_id="col-container"):
-              with gr.Accordion("Upload Files", open = False):
-                  with gr.Row(elem_id="row-flex"):
-                      with gr.Row(elem_id="row-flex"):
-                          with gr.Column(scale=1,):
-                              file_url = gr.Textbox(label='file url :',show_label=True, placeholder="")
-                      with gr.Row(elem_id="row-flex"):
-                          with gr.Column(scale=1):
-                              file_output = gr.File()
-                          with gr.Column(scale=1):
-                              upload_button = gr.UploadButton("Browse File", file_types=[".txt", ".pdf", ".doc", ".docx"],file_count = "multiple")
-              with gr.Row():
-                chatbot = gr.Chatbot([], elem_id="chatbot")
-              with gr.Row():
-                txt = gr.Textbox(label = "Question",show_label=True,placeholder="Enter text and press Enter")
-              with gr.Row():
-                clear_btn = gr.Button(value="Clear")
-          txt_msg = txt.submit(self.add_text, [chatbot, txt], [chatbot, txt], queue=False).then(self.answer_question, [txt, chatbot, state], chatbot)
-          txt_msg.then(lambda: gr.update(interactive=True), None, [txt], queue=False)
-          file_url.submit(self.upload_multiple_urls, file_url, [file_output, state])
-          clear_btn.click(self.clear_function,[state],[])
-          clear_btn.click(lambda: None, None, chatbot, queue=False)
-          upload_button.upload(self.upload_file, upload_button, [file_output,state])
-        demo.queue().launch(debug=True)
-if __name__=="__main__":
-    chatbot = Chatbot()
-    chatbot.gradio_interface()

 from langchain.text_splitter import CharacterTextSplitter
+from langchain.embeddings import OpenAIEmbeddings
 from langchain.vectorstores import FAISS
+from langchain.chat_models import ChatOpenAI
+from langchain.memory import ConversationBufferMemory
+from langchain.chains import ConversationChain
 from langchain.chains import ConversationalRetrievalChain
+from langchain.document_loaders import UnstructuredFileLoader
+from typing import List, Dict, Tuple
 import gradio as gr
 import validators
 import requests
 import mimetypes
 import tempfile
+import os
+from langchain.chains.question_answering import load_qa_chain
+from langchain.llms import OpenAI
+from langchain.prompts import PromptTemplate
+from langchain.prompts.prompt import PromptTemplate
+import pandas as pd
+from langchain.agents import create_pandas_dataframe_agent
+from langchain.agents import ZeroShotAgent, Tool, AgentExecutor
+from langchain import OpenAI, LLMChain
+class ChatDocumentQA:
+    def __init__(self) -> None:
+        """Construct the chatbot object; no instance attributes are set."""
+    def _get_empty_state(self) -> Dict[str, None]:
+        """Return the initial state dict, whose ``knowledge_base`` entry is None."""
+        return dict(knowledge_base=None)
+    def _extract_text_from_pdfs(self, file_paths: List[str]) -> List[str]:
+        """Load the given files with ``UnstructuredFileLoader`` (strategy "fast").
+
+        Args:
+            file_paths (List[str]): Paths handed to the loader, one loader per path.
+        Returns:
+            List[str]: The concatenated results of every loader's ``load()`` call,
+                in input order.
+        """
+        # NOTE(review): nothing here restricts the input to PDFs, and ``load()``
+        # presumably yields document objects rather than plain strings -- confirm
+        # before relying on the method name or the return annotation.
+        # Build every loader first, then load them in order.
+        loaders = []
+        for path in file_paths:
+            loaders.append(UnstructuredFileLoader(path, strategy="fast"))
+        return [document for loader in loaders for document in loader.load()]
+    def _get_content_from_url(self, urls: str) -> List[str]:
+        """Download each valid URL to a temporary file and load its contents.
+
         Args:
+            urls (str): Comma-separated URLs. Whitespace around each entry is
+                ignored; entries that are not valid URLs are skipped.
         Returns:
+            List[str]: Whatever ``_extract_text_from_pdfs`` returns for the
+                downloaded files.
+        Raises:
+            ValueError: If a URL responds with a status code other than 200.
         """
+        headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',}
+        file_paths = []
+        for raw_url in urls.split(','):
+            # "a.com, b.com" must not lose the second entry to a leading space.
+            url = raw_url.strip()
+            if not validators.url(url):
+                continue
+            # A timeout keeps an unresponsive host from hanging the UI callback.
+            r = requests.get(url, headers=headers, timeout=30)
+            if r.status_code != 200:
+                raise ValueError("Check the url of your file; returned status code %s" % r.status_code)
+            # The header may be absent or carry parameters ("text/html; charset=utf-8");
+            # only the bare media type can be mapped to a file extension.
+            content_type = (r.headers.get("content-type") or "").split(";")[0].strip()
+            file_extension = mimetypes.guess_extension(content_type)
+            # Close the file before it is read back by path so the bytes are flushed.
+            with tempfile.NamedTemporaryFile(suffix=file_extension, delete=False) as temp_file:
+                temp_file.write(r.content)
+            file_paths.append(temp_file.name)
+        return self._extract_text_from_pdfs(file_paths)
+    def _split_text_into_chunks(self, text: str) -> List[str]:
+        """Split the loaded content into overlapping chunks.
+
+        Args:
+            text (str): Value forwarded unchanged to ``split_documents``.
+                NOTE(review): that call suggests a list of documents rather
+                than a single string -- confirm against the callers.
+        Returns:
+            List[str]: The chunks produced by the splitter.
+        """
+        splitter = CharacterTextSplitter(
+            separator="\n",
+            chunk_size=1000,
+            chunk_overlap=200,
+            length_function=len,
+        )
+        return splitter.split_documents(text)
+    def _create_vector_store_from_text_chunks(self, text_chunks: List[str]) -> FAISS:
+        """Embed the chunks with OpenAI embeddings and index them in FAISS.
+
         Args:
+            text_chunks (List[str]): Chunks passed to ``FAISS.from_documents``.
         Returns:
+            FAISS: The store returned by ``FAISS.from_documents``.
         """
+        embedding_model = OpenAIEmbeddings()
+        vector_store = FAISS.from_documents(documents=text_chunks, embedding=embedding_model)
+        return vector_store
+    def _create_conversation_chain(self,vectorstore):
+        """Build a ConversationalRetrievalChain over ``vectorstore``.
+
+        The chain uses a ChatOpenAI model at temperature 0, the retriever
+        returned by ``vectorstore.as_retriever()``, a custom condense-question
+        prompt and a buffer memory keyed on ``chat_history``.
+        """
+        # Continuation lines keep their original indentation: it is part of the prompt text.
+        condense_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.
+        Chat History:  {chat_history}
+        Follow Up Input: {question}
+        Standalone question:"""
+        condense_prompt = PromptTemplate.from_template(condense_template)
+        # NOTE(review): a new, empty memory object is created on every call.
+        buffer_memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
+        chat_model = ChatOpenAI(temperature=0)
+        retriever = vectorstore.as_retriever()
+        return ConversationalRetrievalChain.from_llm(
+            llm=chat_model,
+            retriever=retriever,
+            condense_question_prompt=condense_prompt,
+            memory=buffer_memory,
+        )
+    def _get_documents_knowledge_base(self, file_paths: List[str]) -> Tuple[str, Dict[str, FAISS]]:
+        """Build knowledge base from uploaded files.
+
+        The extension of the first upload selects the pipeline: a ``.csv`` file
+        becomes a dataframe question-answering agent; anything else is loaded
+        with the document loader, chunked and indexed in a vector store.
+
+        Args:
+            file_paths (List[str]): Uploaded file objects; each exposes its
+                path through a ``name`` attribute.
+        Returns:
+            Tuple[str, Dict]: Tuple containing a status message and the knowledge base.
+        """
+        # Nothing uploaded: keep the state shape instead of failing on index 0.
+        if not file_paths:
+            return "no file uploaded", {"knowledge_base": None}
+        paths = [uploaded.name for uploaded in file_paths]
+        # Compare case-insensitively so ".CSV" / ".PDF" are recognised too.
+        file_extension = os.path.splitext(paths[0])[1].lower()
+        if file_extension == '.csv':
+            df = pd.read_csv(paths[0])
+            pd_agent = create_pandas_dataframe_agent(OpenAI(temperature=0), df, verbose=True)
+            tools = self.get_agent_tools(pd_agent)
+            memory, tools, prompt = self.create_memory_for_csv_qa(tools)
+            agent_chain = self.create_agent_chain_for_csv_qa(memory, tools, prompt)
+            return "file uploaded", {"knowledge_base": agent_chain}
+        # Every other accepted type (.pdf, .txt, .doc, .docx) goes through the
+        # same loader; previously only .pdf was indexed and the rest replaced
+        # the state with an empty string.
+        documents = self._extract_text_from_pdfs(paths)
+        text_chunks = self._split_text_into_chunks(documents)
+        vectorstore = self._create_vector_store_from_text_chunks(text_chunks)
+        return "file uploaded", {"knowledge_base": vectorstore}
+    def _get_urls_knowledge_base(self, urls: str) -> Tuple[str, Dict[str, FAISS]]:
+        """Build knowledge base from URLs.
+
+        Args:
+            urls (str): Comma-separated URLs.
+        Returns:
+            Tuple[str, Dict]: Tuple containing a status message and the knowledge base.
+                When nothing could be loaded the knowledge base entry is None.
+        """
+        webpage_text = self._get_content_from_url(urls)
+        # With no loaded content there is nothing to embed; report it instead
+        # of handing an empty list to the vector-store builder.
+        if not webpage_text:
+            return "no content could be loaded from the given urls", {"knowledge_base": None}
+        text_chunks = self._split_text_into_chunks(webpage_text)
+        vectorstore = self._create_vector_store_from_text_chunks(text_chunks)
+        return "file uploaded", {"knowledge_base": vectorstore}
+#************************
+#   csv qa
+#************************
+    def get_agent_tools(self,agent):
+        """Wrap ``agent.run`` in a single ``Tool`` and return it in a list."""
+        dataframe_tool = Tool(
+            name="dataframe qa",
+            func=agent.run,
+            description="useful for when you need to answer questions about table data and dataframe data",
+        )
+        return [dataframe_tool]
+    def create_memory_for_csv_qa(self,tools):
+        """Build the prompt and the memory for the CSV question-answering agent.
+
+        Args:
+            tools: Tools passed to ``ZeroShotAgent.create_prompt``; returned unchanged.
+        Returns:
+            Tuple of (memory, tools, prompt).
+        """
+        prompt_prefix = """Have a conversation with a human, answering the following questions about table data and dataframe data as best you can. You have access to the following tools:"""
+        # Continuation lines keep their original indentation: it is part of the prompt text.
+        # NOTE(review): the quote after "Begin!" is sent to the model verbatim -- confirm it is intended.
+        prompt_suffix = """Begin!"
+      {chat_history}
+      Question: {input}
+      {agent_scratchpad}"""
+        agent_prompt = ZeroShotAgent.create_prompt(
+            tools,
+            prefix=prompt_prefix,
+            suffix=prompt_suffix,
+            input_variables=["input", "chat_history", "agent_scratchpad"],
+        )
+        chat_memory = ConversationBufferMemory(memory_key="chat_history",return_messages=True)
+        return chat_memory, tools, agent_prompt
+    def create_agent_chain_for_csv_qa(self,memory,tools,prompt):
+        """Assemble an ``AgentExecutor`` from the given memory, tools and prompt.
+
+        An ``LLMChain`` over an OpenAI model (temperature 0) drives a
+        ``ZeroShotAgent``; the executor wraps that agent together with the
+        tools and the memory.
+        """
+        prompt_chain = LLMChain(llm=OpenAI(temperature=0), prompt=prompt)
+        zero_shot_agent = ZeroShotAgent(llm_chain=prompt_chain, tools=tools, verbose=True)
+        return AgentExecutor.from_agent_and_tools(
+            agent=zero_shot_agent,
+            tools=tools,
+            verbose=True,
+            memory=memory,
+        )
+    def _get_response(self, message: str, chat_history: List[Tuple[str, str]], state: Dict[str, FAISS],file_paths) -> Tuple[str, List[Tuple[str, str]]]:
+        """Answer the user's message with the current knowledge base.
+
         Args:
+            message (str): User's message/question.
+            chat_history (List[Tuple[str, str]]): List of chat history as tuples of (user_message, bot_response).
+            state (dict): State containing the knowledge base.
+            file_paths: Value of the upload button. Kept for interface
+                compatibility; the stored knowledge base itself decides how
+                the question is answered.
         Returns:
+            Tuple[str, List[Tuple[str, str]]]: An empty string (clears the question box) and the updated chat history.
         """
+        chat_history = chat_history or []
+        # The state can be a non-dict (e.g. an empty string) after an unsupported upload.
+        knowledge_base = state.get("knowledge_base") if isinstance(state, dict) else None
+        if knowledge_base is None:
+            chat_history.append((message, "Please Upload Document or URL"))
+            return "", chat_history
+        try:
+            # Dispatch on what is actually stored rather than on the upload
+            # widget's extension: the widget value goes stale once a URL
+            # replaces the knowledge base, and non-pdf documents used to fall
+            # through without returning anything.
+            if isinstance(knowledge_base, AgentExecutor):
+                answer = knowledge_base.run(input=message)
+            else:
+                chat = self._create_conversation_chain(knowledge_base)
+                answer = chat({"question": message, "chat_history": chat_history})["answer"]
+        except Exception as error:
+            # Keep the UI responsive, but do not disguise a real failure as a
+            # missing upload; log it so it can be diagnosed.
+            print("_get_response failed:", repr(error))
+            answer = "Sorry, something went wrong while answering. Please try again."
+        chat_history.append((message, answer))
+        return "", chat_history
+    def gradio_interface(self) -> None:
+        """Build the Gradio Blocks UI, wire its events and launch the app.
+
+        The page holds a chat window, a question box, a file-status box, a
+        clear button, an upload button and a URL box. Submitting URLs or
+        uploading files replaces the knowledge base kept in the session state;
+        submitting a question calls ``_get_response``.
+        """
         with gr.Blocks(css="style.css",theme='karthikeyan-adople/hudsonhayes-gray') as demo:
+            gr.HTML("""<center class="darkblue" style='background-color:rgb(0,1,36); text-align:center;padding:25px;'>
+            <center>
               <h1 class ="center">
                    <img src="file=logo.png" height="110px" width="280px">
               </h1>
+            </center>
+            <br>
+            <h1 style="color:#fff">
                 Virtual Assistant Chatbot
+            </h1>
+            </center>""")
+            state = gr.State(self._get_empty_state())
+            chatbot = gr.Chatbot()
+            with gr.Row():
+                with gr.Column(scale=0.85):
+                    msg = gr.Textbox(label="Question")
+                with gr.Column(scale=0.15):
+                    file_output = gr.Textbox(label="File Status")
+            with gr.Row():
+                with gr.Column(scale=0.85):
+                    clear = gr.ClearButton([msg, chatbot])
+                with gr.Column(scale=0.15):
+                    # ".csv" is listed because the upload handler has a dedicated
+                    # CSV branch that was unreachable through this widget.
+                    upload_button = gr.UploadButton(
+                        "Browse File",
+                        file_types=[".txt", ".pdf", ".doc", ".docx", ".csv"],
+                        file_count="multiple", variant="primary"
+                    )
+            with gr.Row():
+                with gr.Column(scale=1):
+                    input_url = gr.Textbox(label="urls")
+            # Both ingestion paths write the status box and the session state.
+            input_url.submit(self._get_urls_knowledge_base, input_url, [file_output, state])
+            upload_button.upload(self._get_documents_knowledge_base, upload_button, [file_output, state])
+            # The first output clears the question box; the second refreshes the chat.
+            msg.submit(self._get_response, [msg, chatbot, state,upload_button], [msg, chatbot])
+        demo.launch()
+if __name__ == "__main__":
+    # Script entry point: build the app object and start the Gradio UI.
+    ChatDocumentQA().gradio_interface()