Spaces:

capradeepgujaran
/

ChatWithDocuments

Running

App Files Files Community

capradeepgujaran committed on 5 days ago

Commit

8c54584

•

1 Parent(s): 267c744

Update app.py

Browse files

Files changed (1) hide show

app.py +4 -16

app.py CHANGED Viewed

@@ -2,7 +2,6 @@ import os
 import tempfile
 import gradio as gr
 import fitz  # PyMuPDF for reading PDF files
-import pytesseract
 from PIL import Image
 import docx  # for reading .docx files
 from llama_index.core import VectorStoreIndex, Document
@@ -21,28 +20,17 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s | %(levelname)s | %(
 # Load environment variables from .env file
 load_dotenv()
-# Set the path for Tesseract OCR (ensure it's installed in your environment)
-pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"  # This is the typical path for Tesseract on Linux systems
 # Initialize global variables
 vector_index = None
 query_log = []  # Store queries and results for logging purposes
-# Function to handle PDF and OCR for scanned PDFs
 def load_pdf_manually(pdf_path):
     doc = fitz.open(pdf_path)
     text = ""
     for page_num in range(doc.page_count):
         page = doc[page_num]
-        page_text = page.get_text()
-        # If no text (i.e., scanned PDF), use OCR
-        if not page_text.strip():
-            pix = page.get_pixmap()
-            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
-            page_text = pytesseract.image_to_string(img)
-        text += page_text
     return text
 # Function to handle .docx files
@@ -223,7 +211,7 @@ def main():
         **Note:** Ensure you upload documents before attempting to query. Enter a valid OpenAI API Key to interact with the models.
         """)
-    demo.launch()  # Removed share=True since it's not required on Hugging Face Spaces
 if __name__ == "__main__":
-    main()

 import tempfile
 import gradio as gr
 import fitz  # PyMuPDF for reading PDF files
 from PIL import Image
 import docx  # for reading .docx files
 from llama_index.core import VectorStoreIndex, Document
 # Load environment variables from .env file
 load_dotenv()
 # Initialize global variables
 vector_index = None
 query_log = []  # Store queries and results for logging purposes
+# Function to handle PDF (without OCR for scanned PDFs)
 def load_pdf_manually(pdf_path):
     doc = fitz.open(pdf_path)
     text = ""
     for page_num in range(doc.page_count):
         page = doc[page_num]
+        text += page.get_text()
     return text
 # Function to handle .docx files
         **Note:** Ensure you upload documents before attempting to query. Enter a valid OpenAI API Key to interact with the models.
         """)
+    demo.launch()
 if __name__ == "__main__":
+    main()