capradeepgujaran committed on
Commit
8c54584
1 Parent(s): 267c744

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -16
app.py CHANGED
@@ -2,7 +2,6 @@ import os
2
  import tempfile
3
  import gradio as gr
4
  import fitz # PyMuPDF for reading PDF files
5
- import pytesseract
6
  from PIL import Image
7
  import docx # for reading .docx files
8
  from llama_index.core import VectorStoreIndex, Document
@@ -21,28 +20,17 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s | %(levelname)s | %(
21
  # Load environment variables from .env file
22
  load_dotenv()
23
 
24
- # Set the path for Tesseract OCR (ensure it's installed in your environment)
25
- pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract" # This is the typical path for Tesseract on Linux systems
26
-
27
  # Initialize global variables
28
  vector_index = None
29
  query_log = [] # Store queries and results for logging purposes
30
 
31
- # Function to handle PDF and OCR for scanned PDFs
32
  def load_pdf_manually(pdf_path):
33
  doc = fitz.open(pdf_path)
34
  text = ""
35
  for page_num in range(doc.page_count):
36
  page = doc[page_num]
37
- page_text = page.get_text()
38
-
39
- # If no text (i.e., scanned PDF), use OCR
40
- if not page_text.strip():
41
- pix = page.get_pixmap()
42
- img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
43
- page_text = pytesseract.image_to_string(img)
44
-
45
- text += page_text
46
  return text
47
 
48
  # Function to handle .docx files
@@ -223,7 +211,7 @@ def main():
223
  **Note:** Ensure you upload documents before attempting to query. Enter a valid OpenAI API Key to interact with the models.
224
  """)
225
 
226
- demo.launch() # Removed share=True since it's not required on Hugging Face Spaces
227
 
228
  if __name__ == "__main__":
229
- main()
 
2
  import tempfile
3
  import gradio as gr
4
  import fitz # PyMuPDF for reading PDF files
 
5
  from PIL import Image
6
  import docx # for reading .docx files
7
  from llama_index.core import VectorStoreIndex, Document
 
20
  # Load environment variables from .env file
21
  load_dotenv()
22
 
 
 
 
23
  # Initialize global variables
24
  vector_index = None
25
  query_log = [] # Store queries and results for logging purposes
26
 
27
+ # Function to handle PDF (without OCR for scanned PDFs)
28
  def load_pdf_manually(pdf_path):
29
  doc = fitz.open(pdf_path)
30
  text = ""
31
  for page_num in range(doc.page_count):
32
  page = doc[page_num]
33
+ text += page.get_text()
 
 
 
 
 
 
 
 
34
  return text
35
 
36
  # Function to handle .docx files
 
211
  **Note:** Ensure you upload documents before attempting to query. Enter a valid OpenAI API Key to interact with the models.
212
  """)
213
 
214
+ demo.launch()
215
 
216
  if __name__ == "__main__":
217
+ main()