Yasir646 committed on
Commit
8f12416
·
verified ·
1 Parent(s): 10212ec

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -51
app.py CHANGED
@@ -1,60 +1,93 @@
1
  import streamlit as st
2
- from PyPDF2 import PdfReader
3
- import openai
 
 
 
 
4
  import os
5
 
6
# Set up OpenAI API key.
# The key is read from the OPENAI_API_KEY environment variable so that a real
# secret never has to be written into (and committed with) this file. The
# original placeholder string is kept as the fallback value so behaviour is
# unchanged when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "your_openai_api_key")
8
-
9
# Process PDF file to extract text and chunk it into manageable parts
def process_pdf(uploaded_pdf):
    """Extract the text of a PDF and split it into fixed-size character chunks.

    Args:
        uploaded_pdf: The object handed straight to ``PdfReader`` (in this app,
            the value returned by ``st.file_uploader``).

    Returns:
        A list of strings, each at most 1500 characters long, covering the
        concatenated text of every page in order. The list is empty when no
        text could be extracted.
    """
    chunk_size = 1500  # characters per chunk

    pdf_reader = PdfReader(uploaded_pdf)
    # `or ""` is defensive: a page that yields no text must not break the
    # concatenation. Joining once also avoids repeated string `+=` in a loop.
    text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
20
 
21
# Query OpenAI GPT model
def query_openai_gpt(context, user_query):
    """Send the context and the user's question to the "gpt-4" chat model.

    Args:
        context: Text inserted after "Context: " in the user message.
        user_query: Question inserted after "Question: " in the user message.

    Returns:
        The ``content`` of the message in the first returned choice.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant answering questions based on provided context.",
    }
    user_message = {
        "role": "user",
        "content": f"Context: {context}\n\nQuestion: {user_query}",
    }

    completion = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[system_message, user_message],
    )

    first_choice = completion["choices"][0]
    return first_choice["message"]["content"]
31
-
32
# Main Streamlit UI
def main():
    """Render the page: upload a PDF, chunk it, and answer one question about it.

    Nothing beyond the title and uploader is shown until a file is uploaded,
    and no model call is made until the question box is non-empty.
    """
    st.title("PDF Query Application with OpenAI GPT")

    uploaded_pdf = st.file_uploader("Upload your PDF", type="pdf")
    if uploaded_pdf is None:
        # Nothing to process yet.
        return

    st.write("PDF uploaded successfully! Processing...")

    # Process the PDF and extract chunks
    pdf_chunks = process_pdf(uploaded_pdf)
    chunk_count = len(pdf_chunks)
    st.write(f"Extracted {chunk_count} chunks from the PDF.")

    st.write("Ready to answer questions based on the PDF!")

    # User query
    user_query = st.text_input("Ask a question about the PDF:")
    if not user_query:
        return

    # Only the first 5 chunks are used as context (no retrieval step here).
    leading_chunks = pdf_chunks[:5]
    context = " ".join(leading_chunks)
    st.write("Processing your question with OpenAI GPT...")

    # Query OpenAI GPT
    answer = query_openai_gpt(context, user_query)
    st.write(f"Answer: {answer}")
 
 
 
 
 
 
 
 
 
58
 
59
  if __name__ == "__main__":
60
  main()
 
1
  import streamlit as st
2
+ import PyPDF2
3
+ import faiss
4
+ import numpy as np
5
+ from transformers import pipeline, AutoTokenizer, AutoModel
6
+ from sentence_transformers import SentenceTransformer
7
+ from sklearn.metrics.pairwise import cosine_similarity
8
  import os
9
 
10
# Streamlit re-executes this script from the top on every user interaction, so
# constructing the models directly at module level would rebuild them on each
# rerun. Wrapping the constructors in `st.cache_resource` lets reruns reuse the
# already-built objects.
@st.cache_resource
def _load_embedder():
    """Build the Sentence-Transformers model used to embed chunks and queries."""
    return SentenceTransformer('all-MiniLM-L6-v2')  # Open-source model from Hugging Face


@st.cache_resource
def _load_generator():
    """Build the GPT-2 text-generation pipeline used to produce answers."""
    return pipeline("text-generation", model="gpt2")


# Load the pre-trained model for embeddings (Sentence-Transformers)
embedder = _load_embedder()

# Load the language generation model (GPT-2)
generator = _load_generator()
15
+
16
# Function to extract text from PDF
def extract_pdf_text(pdf_file):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_file: The object handed straight to ``PyPDF2.PdfReader`` (in this
            app, the value returned by ``st.file_uploader``).

    Returns:
        The pages' extracted text in page order, separated by newlines. An
        empty string when nothing could be extracted.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # Pages are joined with a newline so the last word of one page and the
    # first word of the next are not fused into a single token before the
    # text is split into words for chunking. `or ""` is defensive: a page
    # that yields no text must not break the join.
    return "\n".join(page.extract_text() or "" for page in reader.pages)
23
+
24
# Function to chunk text into smaller pieces
def chunk_text(text, chunk_size=500):
    """Split text into consecutive chunks of at most ``chunk_size`` words.

    Words are whatever ``str.split()`` yields, so any run of whitespace
    (including newlines) acts as a single separator and each chunk is
    re-joined with single spaces.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.

    Returns:
        A list of chunk strings in document order. Empty when ``text``
        contains no words.

    Raises:
        ValueError: If ``chunk_size`` is not a positive number. Without this
            check a negative size would silently produce no chunks at all.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")

    words = text.split()
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]
31
 
32
# Function to generate embeddings for chunks
def generate_embeddings(chunks):
    """Encode the given chunks with the module-level ``embedder``.

    Args:
        chunks: The text chunks to embed.

    Returns:
        Whatever ``embedder.encode`` produces when called with
        ``convert_to_tensor=True``.
    """
    return embedder.encode(chunks, convert_to_tensor=True)
36
+
37
# FAISS Index Setup
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index holding one vector per row of ``embeddings``.

    Args:
        embeddings: A 2-D collection of vectors, one row per chunk. Either an
            array-like or a tensor-like object exposing
            ``detach().cpu().numpy()``.

    Returns:
        A ``faiss.IndexFlatL2`` containing every row, in the same order.

    Raises:
        ValueError: If ``embeddings`` is not 2-D (for example, the result of
            embedding an empty list of chunks).
    """
    # A tensor may live on an accelerator or carry gradient state, in which
    # case a plain np.array(...) conversion fails; move it to host memory first.
    if hasattr(embeddings, "detach"):
        embeddings = embeddings.detach().cpu().numpy()

    # FAISS expects a contiguous float32 matrix.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            "embeddings must be a 2-D array of shape (n_chunks, dim); "
            f"got shape {vectors.shape}"
        )

    d = vectors.shape[1]  # Dimensionality of the embeddings
    index = faiss.IndexFlatL2(d)  # L2 distance metric
    index.add(vectors)  # Add embeddings to the index
    return index
43
+
44
# Function to find the most relevant chunk based on the query
def find_relevant_chunk(query, index, chunks):
    """Return the chunk whose embedding is nearest to the query's embedding.

    Args:
        query: The user's question.
        index: The FAISS index built over the chunk embeddings.
        chunks: The chunk strings, in the same order as the indexed vectors.

    Returns:
        The element of ``chunks`` at the position of the nearest indexed vector.

    Raises:
        IndexError: If the search yields no usable position (a negative label,
            or one beyond the end of ``chunks``).
    """
    # Encode as a one-row float32 matrix, the input form FAISS searches on.
    query_matrix = np.asarray(embedder.encode([query]), dtype=np.float32)
    _, indices = index.search(query_matrix, k=1)  # Find nearest chunk

    best = int(indices[0][0])
    # A negative label means "no neighbour found". Using it directly would
    # index from the end of the list and silently return the last chunk.
    if best < 0 or best >= len(chunks):
        raise IndexError("no chunk matched the query (is the index empty?)")
    return chunks[best]
49
+
50
# Function to generate a response using GPT-2 and the relevant chunk
def generate_answer(query, relevant_chunk):
    """Generate an answer by letting the text generator continue a prompt.

    The prompt is the relevant chunk, a blank line, then the query.

    Args:
        query: The user's question.
        relevant_chunk: The document chunk to use as context.

    Returns:
        The generated continuation only (the prompt itself is not repeated).
    """
    prompt = relevant_chunk + "\n\n" + query  # Use chunk as context for answering

    # `max_new_tokens` bounds only the generated part. The previous
    # `max_length=150` also counted the prompt's tokens, so a chunk-sized
    # prompt used up the whole budget before any answer could be produced.
    # `return_full_text=False` keeps the echoed prompt out of the answer.
    outputs = generator(
        prompt,
        max_new_tokens=150,
        num_return_sequences=1,
        return_full_text=False,
    )
    return outputs[0]['generated_text']
55
+
56
# Streamlit App Interface
def main():
    """Render the page: upload a PDF, index its chunks, and answer a question.

    Flow: extract the PDF's text, split it into chunks, embed the chunks and
    index them, then (once a question is entered) show the nearest chunk and
    an answer generated from it. A PDF that yields no text is reported to the
    user instead of being passed on to embedding and indexing.
    """
    st.title("PDF Q&A with RAG System")

    # File upload
    uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
    if uploaded_file is None:
        # Nothing to process yet.
        return

    st.write("Processing the PDF...")

    # Extract text from the uploaded PDF
    pdf_text = extract_pdf_text(uploaded_file)
    st.write("Text extracted from the PDF:")
    st.text_area("Extracted Text", pdf_text[:500], height=200)

    # Chunk the extracted text
    chunks = chunk_text(pdf_text)
    if not chunks:
        # With no chunks there is nothing to embed or index; stop here with a
        # clear message rather than failing further down.
        st.error("No extractable text was found in this PDF, so it cannot be searched.")
        return
    embeddings = generate_embeddings(chunks)

    # Create FAISS index
    index = create_faiss_index(embeddings)

    st.write("PDF is processed, you can now ask questions.")

    # User query input
    query = st.text_input("Ask a question about the document:")
    if not query:
        return

    # Find the most relevant chunk
    relevant_chunk = find_relevant_chunk(query, index, chunks)
    st.write("Relevant chunk found:")
    st.text_area("Relevant Chunk", relevant_chunk, height=200)

    # Generate an answer
    answer = generate_answer(query, relevant_chunk)
    st.write("Answer:", answer)
91
 
92
  if __name__ == "__main__":
93
  main()