capradeepgujaran
committed on
Commit
•
8c54584
1
Parent(s):
267c744
Update app.py
Browse files
app.py
CHANGED
@@ -2,7 +2,6 @@ import os
|
|
2 |
import tempfile
|
3 |
import gradio as gr
|
4 |
import fitz # PyMuPDF for reading PDF files
|
5 |
-
import pytesseract
|
6 |
from PIL import Image
|
7 |
import docx # for reading .docx files
|
8 |
from llama_index.core import VectorStoreIndex, Document
|
@@ -21,28 +20,17 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s | %(levelname)s | %(
|
|
21 |
# Load environment variables from .env file
|
22 |
load_dotenv()
|
23 |
|
24 |
-
# Set the path for Tesseract OCR (ensure it's installed in your environment)
|
25 |
-
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract" # This is the typical path for Tesseract on Linux systems
|
26 |
-
|
27 |
# Initialize global variables
|
28 |
vector_index = None
|
29 |
query_log = [] # Store queries and results for logging purposes
|
30 |
|
31 |
-
# Function to handle PDF
|
32 |
def load_pdf_manually(pdf_path):
|
33 |
doc = fitz.open(pdf_path)
|
34 |
text = ""
|
35 |
for page_num in range(doc.page_count):
|
36 |
page = doc[page_num]
|
37 |
-
|
38 |
-
|
39 |
-
# If no text (i.e., scanned PDF), use OCR
|
40 |
-
if not page_text.strip():
|
41 |
-
pix = page.get_pixmap()
|
42 |
-
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
43 |
-
page_text = pytesseract.image_to_string(img)
|
44 |
-
|
45 |
-
text += page_text
|
46 |
return text
|
47 |
|
48 |
# Function to handle .docx files
|
@@ -223,7 +211,7 @@ def main():
|
|
223 |
**Note:** Ensure you upload documents before attempting to query. Enter a valid OpenAI API Key to interact with the models.
|
224 |
""")
|
225 |
|
226 |
-
demo.launch()
|
227 |
|
228 |
if __name__ == "__main__":
|
229 |
-
main()
|
|
|
2 |
import tempfile
|
3 |
import gradio as gr
|
4 |
import fitz # PyMuPDF for reading PDF files
|
|
|
5 |
from PIL import Image
|
6 |
import docx # for reading .docx files
|
7 |
from llama_index.core import VectorStoreIndex, Document
|
|
|
20 |
# Load environment variables from .env file
|
21 |
load_dotenv()
|
22 |
|
|
|
|
|
|
|
23 |
# Initialize global variables
|
24 |
vector_index = None
|
25 |
query_log = [] # Store queries and results for logging purposes
|
26 |
|
27 |
+
# Function to handle PDF (without OCR for scanned PDFs)
|
28 |
def load_pdf_manually(pdf_path):
    """Return the embedded text of every page of a PDF, in page order.

    No OCR is performed: only the text that PyMuPDF's ``get_text()``
    reports for each page is collected, so image-only (scanned) pages
    are expected to contribute little or no text.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        A single string with the text of all pages concatenated.
    """
    # Open the document as a context manager so it is closed (and its file
    # handle released) even when text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        # Iterating the document yields its pages in order; join avoids
        # repeated string re-allocation on large PDFs.
        return "".join(page.get_text() for page in doc)
|
35 |
|
36 |
# Function to handle .docx files
|
|
|
211 |
**Note:** Ensure you upload documents before attempting to query. Enter a valid OpenAI API Key to interact with the models.
|
212 |
""")
|
213 |
|
214 |
+
demo.launch()
|
215 |
|
216 |
if __name__ == "__main__":
|
217 |
+
main()
|