ashok2216 committed on
Commit
0dcfd6e
·
verified ·
1 Parent(s): 765b4cd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +127 -35
app.py CHANGED
@@ -1,46 +1,138 @@
1
- import tempfile
2
- import PyPDF2
 
 
3
  import streamlit as st
4
- from transformers import GPT2LMHeadModel, GPT2Tokenizer
5
 
6
- # Load pre-trained GPT-3 model and tokenizer
7
- tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
8
- model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")
 
 
 
 
 
 
 
 
 
 
9
 
10
-
11
- def extract_text_from_pdf(file_path):
12
- text = ""
13
- with open(file_path, "rb") as f:
14
- reader = PyPDF2.PdfFileReader(f)
15
- for page_num in range(reader.numPages):
16
- text += reader.getPage(page_num).extractText()
17
- return text
18
-
19
- def generate_response(user_input):
20
- input_ids = tokenizer.encode(user_input, return_tensors="pt")
21
- output = model.generate(input_ids, max_length=100, num_return_sequences=1, temperature=0.7)
22
- response = tokenizer.decode(output[0], skip_special_tokens=True)
23
- return response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
 
25
  def main():
26
- st.title("PDF Chatbot")
 
27
 
28
- pdf_file = st.file_uploader("Upload an pdf file", type=["pdf"], accept_multiple_files=False)
 
 
29
 
30
- if pdf_file is not None:
31
- with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
32
- tmp_file.write(pdf_file.read())
33
- st.success("PDF file successfully uploaded and stored temporally.")
34
- file_path = tmp_file.name
35
- pdf_text = extract_text_from_pdf(file_path)
36
- st.text_area("PDF Content", pdf_text)
37
- else:
38
- st.markdown('File not found!')
39
 
40
- user_input = st.text_input("You:", "")
41
- if st.button("Send"):
42
- response = generate_response(user_input)
43
- st.text_area("Chatbot:", response)
 
 
 
 
 
 
 
 
44
 
45
  if __name__ == "__main__":
46
  main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import chromadb
2
+ from chromadb.utils import embedding_functions
3
+ from sentence_transformers import SentenceTransformer
4
+ from transformers import pipeline
5
  import streamlit as st
6
+ import fitz # PyMuPDF for PDF parsing
7
 
8
+ # Step 1: Setup ChromaDB
9
def setup_chromadb():
    """Create a fresh ``pdf_data`` collection in an in-memory ChromaDB client.

    Any existing ``pdf_data`` collection is dropped first so that each call
    starts from an empty index.

    Returns:
        A ``(client, collection)`` tuple. The collection embeds documents
        with the ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    # Initialize ChromaDB in-memory instance
    client = chromadb.Client()
    try:
        client.delete_collection("pdf_data")
        print("Existing collection 'pdf_data' deleted.")
    except Exception:
        # The exception type raised for a missing collection differs between
        # ChromaDB releases, so catch Exception rather than one class. Unlike
        # a bare ``except:``, this lets KeyboardInterrupt/SystemExit through.
        print("Collection 'pdf_data' not found, creating a new one.")

    # Create a new collection with the embedding function
    ef = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    collection = client.create_collection("pdf_data", embedding_function=ef)
    return client, collection
21
 
22
+ # Step 2: Extract Text from PDF
23
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_path: Either a filesystem path to the PDF, the raw PDF content as
            ``bytes``/``bytearray``, or a binary file-like object (anything
            exposing ``getvalue()`` or ``read()``, e.g. the object returned
            by ``st.file_uploader``).

    Returns:
        The text of all pages joined in page order.
    """
    # PyMuPDF treats the first positional argument as a file name, so
    # in-memory data has to go through ``stream=`` instead.
    if isinstance(pdf_path, (bytes, bytearray)):
        pdf_bytes = bytes(pdf_path)
    elif hasattr(pdf_path, "getvalue"):
        # getvalue() returns the whole buffer regardless of the read position.
        pdf_bytes = pdf_path.getvalue()
    elif hasattr(pdf_path, "read"):
        pdf_bytes = pdf_path.read()
    else:
        pdf_bytes = None

    if pdf_bytes is None:
        doc = fitz.open(pdf_path)
    else:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")

    with doc:
        return "".join(page.get_text() for page in doc)
29
+
30
+ # Step 3: Add Extracted Text to Vector Database
31
def add_pdf_text_to_db(collection, pdf_text, batch_size=256):
    """Index every non-blank line of ``pdf_text`` in ``collection``.

    Each line becomes one document whose id is ``pdf_text_<line index>`` and
    whose metadata records the line index and the line itself.

    Args:
        collection: Object exposing ``add(ids=..., documents=..., metadatas=...)``.
        pdf_text: Full text to index; it is split on ``"\\n"``.
        batch_size: Maximum number of lines sent per ``add`` call. Batching
            avoids one embedding/insert round trip per line while keeping
            each request bounded.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Keep the original line index so ids stay stable even though blank
    # lines are skipped.
    entries = [
        (idx, line)
        for idx, line in enumerate(pdf_text.split("\n"))
        if line.strip()
    ]

    # range() is empty when there are no entries, so nothing is sent.
    for start in range(0, len(entries), batch_size):
        batch = entries[start:start + batch_size]
        collection.add(
            ids=[f"pdf_text_{idx}" for idx, _ in batch],
            documents=[line for _, line in batch],
            metadatas=[{"line_number": idx, "text": line} for idx, line in batch],
        )
40
+
41
+ # Step 4: Query Function
42
def query_pdf_data(collection, query, retriever_model, n_results=3):
    """Answer ``query`` using the closest passages stored in ``collection``.

    Args:
        collection: Object exposing ``query(query_texts=..., n_results=...)``
            and returning a mapping with ``"documents"`` and ``"metadatas"``.
        query: The user's question.
        retriever_model: Callable that takes the assembled prompt string and
            returns the generated answer.
        n_results: How many passages to retrieve as context.

    Returns:
        A ``(answer, metadatas)`` tuple: whatever ``retriever_model`` returned
        for the prompt, and ``results["metadatas"]`` exactly as the
        collection produced it.
    """
    results = collection.query(query_texts=[query], n_results=n_results)

    # Only one query text was sent, so its passages are the first entry.
    passages = results["documents"][0]
    context = " ".join(passages)

    answer = retriever_model(f"Context: {context}\nQuestion: {query}")
    return answer, results["metadatas"]
50
 
51
+ # Streamlit Interface
52
def _load_generator():
    """Return the flan-t5-small text2text pipeline, cached when Streamlit allows."""
    def _build_pipeline():
        return pipeline("text2text-generation", model="google/flan-t5-small")  # Free LLM

    # Streamlit re-executes the whole script on every interaction; without
    # caching the model would be rebuilt each time. Older Streamlit releases
    # lack cache_resource, so fall back to an uncached load there.
    cache_resource = getattr(st, "cache_resource", None)
    if cache_resource is None:
        return _build_pipeline()
    return cache_resource(_build_pipeline)()


def main():
    """Render the Streamlit page: upload a PDF, index it, answer questions.

    Each run creates a fresh vector collection, indexes the uploaded PDF
    (if any) line by line, and, when a query is entered, shows the generated
    answer followed by the metadata of the retrieved lines.
    """
    st.title("PDF Chatbot with Retrieval-Augmented Generation")
    st.write("Upload a PDF, and ask questions about its content!")

    # Initialize components
    _client, collection = setup_chromadb()
    retriever_model = _load_generator()

    # File upload
    uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")
    if uploaded_file:
        st.write("Extracting text and populating the database...")
        try:
            pdf_text = extract_text_from_pdf(uploaded_file)
            add_pdf_text_to_db(collection, pdf_text)
        except Exception as e:
            # Surface parsing/indexing failures in the page instead of
            # letting the script crash with a traceback.
            st.error(f"An error occurred: {str(e)}")
            return
        if pdf_text.strip():
            st.success("PDF text has been added to the database. You can now query it!")
        else:
            st.warning("No extractable text was found in this PDF.")

    # Query Input
    query = st.text_input("Enter your query about the PDF:")
    if query:
        try:
            answer, metadata = query_pdf_data(collection, query, retriever_model)
            st.subheader("Answer:")
            st.write(answer[0]['generated_text'])
            st.subheader("Retrieved Context:")
            # metadata holds one list per query text; only one was sent.
            for meta in metadata[0]:
                st.write(meta)
        except Exception as e:
            st.error(f"An error occurred: {str(e)}")
80
 
81
  if __name__ == "__main__":
82
  main()
83
+
84
+
85
+
86
+
87
+
88
+
89
+
90
+
91
+
92
+
93
+ # import tempfile
94
+ # import PyPDF2
95
+ # import streamlit as st
96
+ # from transformers import GPT2LMHeadModel, GPT2Tokenizer
97
+
98
+ # # Load pre-trained GPT-2 model and tokenizer
99
+ # tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
100
+ # model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")
101
+
102
+
103
+ # def extract_text_from_pdf(file_path):
104
+ # text = ""
105
+ # with open(file_path, "rb") as f:
106
+ # reader = PyPDF2.PdfFileReader(f)
107
+ # for page_num in range(reader.numPages):
108
+ # text += reader.getPage(page_num).extractText()
109
+ # return text
110
+
111
+ # def generate_response(user_input):
112
+ # input_ids = tokenizer.encode(user_input, return_tensors="pt")
113
+ # output = model.generate(input_ids, max_length=100, num_return_sequences=1, temperature=0.7)
114
+ # response = tokenizer.decode(output[0], skip_special_tokens=True)
115
+ # return response
116
+
117
+ # def main():
118
+ # st.title("PDF Chatbot")
119
+
120
+ # pdf_file = st.file_uploader("Upload an pdf file", type=["pdf"], accept_multiple_files=False)
121
+
122
+ # if pdf_file is not None:
123
+ # with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
124
+ # tmp_file.write(pdf_file.read())
125
+ # st.success("PDF file successfully uploaded and stored temporally.")
126
+ # file_path = tmp_file.name
127
+ # pdf_text = extract_text_from_pdf(file_path)
128
+ # st.text_area("PDF Content", pdf_text)
129
+ # else:
130
+ # st.markdown('File not found!')
131
+
132
+ # user_input = st.text_input("You:", "")
133
+ # if st.button("Send"):
134
+ # response = generate_response(user_input)
135
+ # st.text_area("Chatbot:", response)
136
+
137
+ # if __name__ == "__main__":
138
+ # main()