Nikhil0987 committed on
Commit
c41be8b
1 Parent(s): 78b1cb9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -207
app.py CHANGED
@@ -1,244 +1,106 @@
1
  import streamlit as st
2
- import langchain
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter
4
- from langchain.embeddings.openai import OpenAIEmbeddings
5
- from langchain.vectorstores import Chroma
6
- from langchain import OpenAI, VectorDBQA
7
- from langchain.chains import RetrievalQAWithSourcesChain
8
- import PyPDF2
9
  import os
 
 
 
 
 
 
 
 
 
 
 
10
 
11
- api_key = os.environ["OPENAI_API_KEY"]
12
-
13
- #This function will go through pdf and extract and return list of page texts.
14
- def read_and_textify(files):
15
- text_list = []
16
- sources_list = []
17
- for file in files:
18
- pdfReader = PyPDF2.PdfReader(file)
19
- #print("Page Number:", len(pdfReader.pages))
20
- for i in range(len(pdfReader.pages)):
21
- pageObj = pdfReader.pages[i]
22
- text = pageObj.extract_text()
23
- pageObj.clear()
24
- text_list.append(text)
25
- sources_list.append(file.name + "_page_"+str(i))
26
- return [text_list,sources_list]
27
-
28
- st.set_page_config(layout="centered", page_title="Multidoc_QnA")
29
- st.header("Multidoc_QnA")
30
- st.write("---")
31
-
32
- #file uploader
33
- uploaded_files = st.file_uploader("Upload documents",accept_multiple_files=True, type=["txt","pdf"])
34
- st.write("---")
35
-
36
- if uploaded_files is None:
37
- st.info(f"""Upload files to analyse""")
38
- elif uploaded_files:
39
- st.write(str(len(uploaded_files)) + " document(s) loaded..")
40
-
41
- textify_output = read_and_textify(uploaded_files)
42
-
43
- documents = textify_output[0]
44
- sources = textify_output[1]
45
-
46
- #extract embeddings
47
- embeddings = OpenAIEmbeddings(openai_api_key = api_key)
48
- #vstore with metadata. Here we will store page numbers.
49
- vStore = Chroma.from_texts(documents, embeddings, metadatas=[{"source": s} for s in sources])
50
- #deciding model
51
- model_name = "gpt-3.5-turbo"
52
- # model_name = "gpt-4"
53
-
54
- retriever = vStore.as_retriever()
55
- retriever.search_kwargs = {'k':2}
56
-
57
- #initiate model
58
- llm = OpenAI(model_name=model_name, openai_api_key = api_key, streaming=True)
59
- model = RetrievalQAWithSourcesChain.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)
60
-
61
- st.header("Ask your data")
62
- user_q = st.text_area("Enter your questions here")
63
-
64
- if st.button("Get Response"):
65
- try:
66
- with st.spinner("Model is working on it..."):
67
- result = model({"question":user_q}, return_only_outputs=True)
68
- st.subheader('Your response:')
69
- st.write(result['answer'])
70
- st.subheader('Source pages:')
71
- st.write(result['sources'])
72
- except Exception as e:
73
- st.error(f"An error occurred: {e}")
74
- st.error('Oops, the GPT response resulted in an error :( Please try again with a different question.')
75
-
76
-
77
-
78
-
79
-
80
-
81
-
82
-
83
-
84
-
85
-
86
-
87
 
88
 
89
 
90
 
91
 
 
 
 
 
 
 
 
92
 
93
 
94
 
 
 
 
 
95
 
96
 
 
 
 
 
97
 
98
 
 
99
 
 
 
 
 
 
100
 
 
 
101
 
 
 
102
 
 
 
103
 
 
104
 
105
 
106
 
 
 
 
 
 
107
 
 
108
 
 
 
 
 
109
 
 
 
110
 
111
- # import gradio as gr
112
- # import streamlit as st
113
- # from langchain.embeddings.openai import OpenAIEmbeddings
114
- # from langchain.text_splitter import CharacterTextSplitter
115
- # from langchain.vectorstores import Chroma
116
- # from langchain.chains import ConversationalRetrievalChain
117
- # from langchain.chat_models import ChatOpenAI
118
- # from langchain.document_loaders import PyPDFLoader
119
- # import os
120
- # import fitz
121
- # from PIL import Image
122
 
123
 
124
- # # Global variables
125
- # COUNT, N = 0, 0
126
- # chat_history = []
127
- # chain = None # Initialize chain as None
128
 
129
- # # Function to set the OpenAI API key
 
 
130
 
131
- # api_key = os.environ['OPENAI_API_KEY']
132
 
133
- # st.write(api_key)
 
134
 
135
-
136
- # # Function to enable the API key input box
137
- # def enable_api_box():
138
- # return enable_box
139
-
140
- # # Function to add text to the chat history
141
- # def add_text(history, text):
142
- # if not text:
143
- # raise gr.Error('Enter text')
144
- # history = history + [(text, '')]
145
- # return history
146
-
147
- # # Function to process the PDF file and create a conversation chain
148
- # def process_file(file):
149
- # global chain
150
- # if 'OPENAI_API_KEY' not in os.environ:
151
- # raise gr.Error('Upload your OpenAI API key')
152
-
153
- # # Replace with your actual PDF processing logic
154
- # loader = PyPDFLoader(file.name)
155
- # documents = loader.load()
156
- # embeddings = OpenAIEmbeddings()
157
- # pdfsearch = Chroma.from_documents(documents, embeddings)
158
-
159
- # chain = ConversationalRetrievalChain.from_llm(ChatOpenAI(temperature=0.3),
160
- # retriever=pdfsearch.as_retriever(search_kwargs={"k": 1}),
161
- # return_source_documents=True)
162
- # return chain
163
-
164
- # # Function to generate a response based on the chat history and query
165
- # def generate_response(history, query, pdf_upload):
166
- # global COUNT, N, chat_history, chain
167
- # if not pdf_upload:
168
- # raise gr.Error(message='Upload a PDF')
169
-
170
- # if COUNT == 0:
171
- # chain = process_file(pdf_upload)
172
- # COUNT += 1
173
-
174
- # # Replace with your LangChain logic to generate a response
175
- # result = chain({"question": query, 'chat_history': chat_history}, return_only_outputs=True)
176
- # chat_history += [(query, result["answer"])]
177
- # N = list(result['source_documents'][0])[1][1]['page'] # Adjust as needed
178
-
179
- # for char in result['answer']:
180
- # history[-1][-1] += char
181
- # return history, ''
182
-
183
- # # Function to render a specific page of a PDF file as an image
184
- # def render_file(file):
185
- # global N
186
- # doc = fitz.open(file.name)
187
- # page = doc[N]
188
- # pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))
189
- # image = Image.frombytes('RGB', [pix.width, pix.height], pix.samples)
190
- # return image
191
-
192
- # # Function to render initial content from the PDF
193
- # def render_first(pdf_file):
194
- # # Replace with logic to process the PDF and generate an initial image
195
- # image = Image.new('RGB', (600, 400), color = 'white') # Placeholder
196
- # return image
197
-
198
- # # Streamlit & Gradio Interface
199
-
200
- # st.title("PDF-Powered Chatbot")
201
-
202
- # with st.container():
203
- # gr.Markdown("""
204
- # <style>
205
- # .image-container { height: 680px; }
206
- # </style>
207
- # """)
208
-
209
- # with gr.Blocks() as demo:
210
- # pdf_upload1 = gr.UploadButton("📁 Upload PDF 1", file_types=[".pdf"]) # Define pdf_upload1
211
-
212
- # # ... (rest of your interface creation)
213
-
214
- # txt = gr.Textbox(label="Enter your query", placeholder="Ask a question...")
215
- # submit_btn = gr.Button('Submit')
216
-
217
- # @submit_btn.click()
218
- # def on_submit():
219
- # add_text(chatbot, txt)
220
- # generate_response(chatbot, txt, pdf_upload1) # Use pdf_upload1 here
221
- # render_file(pdf_upload1) # Use pdf_upload1 here
222
-
223
- # if __name__ == "__main__":
224
- # gr.Interface(
225
- # fn=generate_response,
226
- # inputs=[
227
- # "file", # Define pdf_upload1
228
- # "text", # Define chatbot output
229
- # "text" # Define txt
230
- # ],
231
- # outputs=[
232
- # "image", # Define show_img
233
- # "text", # Define chatbot output
234
- # "text" # Define txt
235
- # ],
236
- # title="PDF-Powered Chatbot"
237
- # ).launch()
238
-
239
-
240
-
241
 
242
 
243
 
244
-
 
 
1
  import streamlit as st
2
+ from PyPDF2 import PdfReader
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
 
 
 
 
4
  import os
5
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings
6
+ import google.generativeai as genai
7
+ from langchain.vectorstores import FAISS
8
+ from langchain_google_genai import ChatGoogleGenerativeAI
9
+ from langchain.chains.question_answering import load_qa_chain
10
+ from langchain.prompts import PromptTemplate
11
+ from dotenv import load_dotenv
12
+
13
# Read a local .env file (if present) so GOOGLE_API_KEY can be supplied outside
# the real process environment, then pass the key to the google.generativeai
# client. The previous bare os.getenv(...) call discarded its result and has
# been dropped. If the variable is unset, api_key is None here.
load_dotenv()
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
 
19
 
20
 
21
 
22
def get_pdf_text(pdf_docs):
    """Return the text of every page of every PDF, concatenated in order.

    Args:
        pdf_docs: Iterable of inputs accepted by ``PdfReader`` (in this app,
            the files returned by the Streamlit uploader). ``None`` or an
            empty iterable is treated as "no documents".

    Returns:
        A single string made of all page texts joined without a separator;
        the empty string when there is nothing to read.
    """
    page_texts = []
    for pdf in pdf_docs or ():
        for page in PdfReader(pdf).pages:
            # A page without extractable text must not break concatenation,
            # so a None/empty result is normalised to "".
            page_texts.append(page.extract_text() or "")
    # Join once instead of growing a string with += on every page.
    return "".join(page_texts)
29
 
30
 
31
 
32
def get_text_chunks(text, chunk_size=10000, chunk_overlap=1000):
    """Split *text* into overlapping chunks for embedding.

    Args:
        text: The full text to split.
        chunk_size: Maximum chunk size handed to the splitter. Defaults to
            the value that used to be hard-coded (10000).
        chunk_overlap: Overlap between consecutive chunks. Defaults to the
            value that used to be hard-coded (1000).

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_text`` returns for
        *text* (a list of string chunks).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(text)
36
 
37
 
38
def get_vector_store(text_chunks):
    """Embed *text_chunks* and persist a FAISS index to ``faiss_index``.

    Args:
        text_chunks: Non-empty list of text chunks to embed and index.

    Raises:
        ValueError: If *text_chunks* is empty. Without this check the empty
            list would be passed straight to the embedding/FAISS call, which
            has nothing to build an index from.
    """
    if not text_chunks:
        raise ValueError("No text chunks to index; nothing was extracted from the PDFs.")

    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
    # Saved under a fixed local name; user_input() loads the same name back.
    vector_store.save_local("faiss_index")
42
 
43
 
44
def get_conversational_chain():
    """Build the question-answering chain used to answer from retrieved text.

    The chain is a "stuff" QA chain over the ``gemini-pro`` chat model
    (temperature 0.3) with a prompt that takes ``context`` and ``question``
    and tells the model to answer only from the supplied context.

    Returns:
        The chain object produced by ``load_qa_chain``.
    """
    # NOTE(review): the leading whitespace of the template lines is not
    # recoverable from the diff view; the text is reproduced as shown there.
    qa_template = """
Answer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is not in
provided context just say, "answer is not available in the context", don't provide the wrong answer\n\n
Context:\n {context}?\n
Question: \n{question}\n

Answer:
"""
    qa_prompt = PromptTemplate(
        template=qa_template,
        input_variables=["context", "question"],
    )
    chat_model = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3)
    return load_qa_chain(chat_model, chain_type="stuff", prompt=qa_prompt)
62
 
63
 
64
 
65
def user_input(user_question):
    """Answer *user_question* from the saved FAISS index and show the reply.

    Loads the index stored under ``faiss_index``, retrieves the documents most
    similar to the question, runs them through the QA chain and writes the
    chain's ``output_text`` to the Streamlit page.

    Args:
        user_question: The question text entered by the user.
    """
    # The index only exists after "Submit & Process" has run at least once;
    # loading it earlier would raise, so tell the user what to do instead.
    if not os.path.exists("faiss_index"):
        st.warning("Please upload and process your PDF files before asking a question.")
        return

    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

    # NOTE(review): newer LangChain releases require
    # allow_dangerous_deserialization=True here — confirm against the
    # installed version before adding it.
    new_db = FAISS.load_local("faiss_index", embeddings)
    docs = new_db.similarity_search(user_question)

    chain = get_conversational_chain()
    response = chain(
        {"input_documents": docs, "question": user_question},
        return_only_outputs=True,
    )

    print(response)
    st.write("Reply: ", response["output_text"])
80
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
 
 
 
 
 
83
 
84
def main():
    """Render the Streamlit page: question box plus a PDF upload sidebar.

    A non-empty question is answered via ``user_input``. In the sidebar, the
    "Submit & Process" button extracts text from the uploaded PDFs, chunks it
    and rebuilds the vector index.
    """
    st.set_page_config("Chat PDF")
    st.header("Chat with PDF using Gemini💁")

    user_question = st.text_input("Ask a Question from the PDF Files")

    if user_question:
        user_input(user_question)

    with st.sidebar:
        st.title("Menu:")
        pdf_docs = st.file_uploader(
            "Upload your PDF Files and Click on the Submit & Process Button",
            accept_multiple_files=True,
            type=["pdf"],  # everything downstream assumes PDF input
        )
        if st.button("Submit & Process"):
            if not pdf_docs:
                # Clicking the button with nothing uploaded used to fall
                # through to the indexing step with no text at all.
                st.warning("Please upload at least one PDF file first.")
            else:
                with st.spinner("Processing..."):
                    raw_text = get_pdf_text(pdf_docs)
                    text_chunks = get_text_chunks(raw_text)
                    if text_chunks:
                        get_vector_store(text_chunks)
                        st.success("Done")
                    else:
                        # e.g. PDFs from which no text could be extracted
                        st.error("No text could be extracted from the uploaded PDF files.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
103
 
104
 
105
# Script entry point: calls main() only when this module is run as __main__.
if __name__ == "__main__":
    main()