capradeepgujaran committed on
Commit
2ad74db
1 Parent(s): a7d59e2

Update app.py

Files changed (1)
  1. app.py +49 -79
app.py CHANGED
@@ -1,20 +1,18 @@
 import os
-import tempfile
-import logging
+import cv2
+import numpy as np
+from PIL import Image
+import pytesseract
 import gradio as gr
-import PyPDF2
 from pdf2image import convert_from_path
-import pytesseract
-from PIL import Image
-import docx
+import PyPDF2
 from llama_index.core import VectorStoreIndex, Document
 from llama_index.embeddings.openai import OpenAIEmbedding
 from llama_index.llms.openai import OpenAI
 from llama_index.core import get_response_synthesizer
 from dotenv import load_dotenv
 from sentence_transformers import SentenceTransformer, util
-from sklearn.metrics.pairwise import cosine_similarity
-import numpy as np
+import logging
 
 # Set up logging configuration
 logging.basicConfig(level=logging.INFO, format='%(asctime)s | %(levelname)s | %(message)s')
@@ -22,79 +20,63 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s | %(levelname)s | %(
 # Load environment variables from .env file
 load_dotenv()
 
-# Tesseract language options
-langs = os.popen('tesseract --list-langs').read().split('\n')[1:-1]
-
 # Initialize global variables
 vector_index = None
 query_log = []
 sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
 
-def extract_text_from_pdf(pdf_path, lang=None):
-    text = ""
-    image_count = 0
-    total_pages = 0
+langs = os.popen('tesseract --list-langs').read().split('\n')[1:-1]
+
+def preprocess_image(image_path):
+    """
+    Pre-process the image to improve OCR results.
+    - Convert to grayscale
+    - Apply thresholding to improve contrast
+    - Apply denoising if needed
+    """
+    img = cv2.imread(image_path)
+    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+    gray = cv2.equalizeHist(gray)
+    gray = cv2.GaussianBlur(gray, (5, 5), 0)
+    processed_image = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+                                            cv2.THRESH_BINARY, 11, 2)
+    temp_filename = "processed_image.png"
+    cv2.imwrite(temp_filename, processed_image)
+    return temp_filename
+
+def extract_text_from_image(image_path, lang='eng'):
+    processed_image_path = preprocess_image(image_path)
+    text = pytesseract.image_to_string(Image.open(processed_image_path), lang=lang)
+    return text
 
+def extract_text_from_pdf(pdf_path, lang='eng'):
+    text = ""
     try:
         with open(pdf_path, 'rb') as file:
             pdf_reader = PyPDF2.PdfReader(file)
-            total_pages = len(pdf_reader.pages)
-
-            for page_num, page in enumerate(pdf_reader.pages, 1):
+            for page_num in range(len(pdf_reader.pages)):
+                page = pdf_reader.pages[page_num]
                 page_text = page.extract_text()
-
-                # If text is not found, consider the page as an image and use OCR
-                if not page_text.strip():
-                    images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
-                    for image in images:
-                        ocr_text = pytesseract.image_to_string(image, lang=None if lang == [] else lang)
-                        text += ocr_text
-                    image_count += 1
-                    text += f"\n[OCR applied on image detected on page {page_num}]\n"
-                else:
+                if page_text.strip():
                     text += page_text
+                else:
+                    images = convert_from_path(pdf_path, first_page=page_num + 1, last_page=page_num + 1)
+                    for image in images:
+                        image.save('temp_image.png', 'PNG')
+                        text += extract_text_from_image('temp_image.png', lang=lang)
+                    text += f"\n[OCR applied on page {page_num + 1}]\n"
     except Exception as e:
-        logging.error(f"Error processing PDF {pdf_path}: {str(e)}")
-        return f"[Error processing PDF: {str(e)}]\n"
-
-    if image_count == total_pages:
-        summary = f"This document consists of {total_pages} page(s) of images.\n"
-        summary += "No text could be extracted directly. OCR was applied to images.\n"
-        summary += f"File path: {pdf_path}\n"
-        return summary
-    elif image_count > 0:
-        text = f"This document contains both text and images.\n" + \
-               f"Total pages: {total_pages}\n" + \
-               f"Pages with images: {image_count}\n" + \
-               f"Extracted text (including OCR):\n\n" + text
-
+        return f"Error processing PDF: {str(e)}"
     return text
 
-def load_docx_file(docx_path):
-    try:
-        doc = docx.Document(docx_path)
-        return '\n'.join([para.text for para in doc.paragraphs])
-    except Exception as e:
-        logging.error(f"Error processing DOCX {docx_path}: {str(e)}")
-        return f"[Error processing DOCX: {str(e)}]\n"
-
-def load_txt_file(txt_path):
-    try:
-        with open(txt_path, 'r', encoding='utf-8') as f:
-            return f.read()
-    except Exception as e:
-        logging.error(f"Error processing TXT {txt_path}: {str(e)}")
-        return f"[Error processing TXT: {str(e)}]\n"
-
-def load_file_based_on_extension(file_path, lang=None):
-    if file_path.lower().endswith('.pdf'):
+def extract_text(file_path, lang='eng'):
+    file_ext = file_path.lower().split('.')[-1]
+    if file_ext in ['pdf']:
         return extract_text_from_pdf(file_path, lang)
-    elif file_path.lower().endswith('.docx'):
-        return load_docx_file(file_path)
-    elif file_path.lower().endswith('.txt'):
-        return load_txt_file(file_path)
+    elif file_ext in ['png', 'jpg', 'jpeg']:
+        return extract_text_from_image(file_path, lang)
     else:
-        raise ValueError(f"Unsupported file format: {file_path}")
+        return f"Unsupported file type: {file_ext}"
 
 def process_upload(api_key, files, lang):
     global vector_index
@@ -111,7 +93,7 @@ def process_upload(api_key, files, lang):
 
     for file_path in files:
         try:
-            text = load_file_based_on_extension(file_path, lang)
+            text = extract_text(file_path, lang)
             if "This document consists of" in text and "page(s) of images" in text:
                 image_heavy_docs.append(os.path.basename(file_path))
             documents.append(Document(text=text))
@@ -137,30 +119,18 @@ def process_upload(api_key, files, lang):
     else:
         return f"No valid documents were indexed. Errors: {'; '.join(error_messages)}", None
 
-# Define the calculate_similarity function
 def calculate_similarity(response, ground_truth):
-    # Encode the response and ground truth
     response_embedding = sentence_model.encode(response, convert_to_tensor=True)
     truth_embedding = sentence_model.encode(ground_truth, convert_to_tensor=True)
 
-    # Convert embeddings to numpy arrays for easier manipulation
-    response_embedding = response_embedding.cpu().numpy()
-    truth_embedding = truth_embedding.cpu().numpy()
-
-    # Normalize the embeddings to unit vectors (magnitude of 1)
     response_embedding = response_embedding / np.linalg.norm(response_embedding)
    truth_embedding = truth_embedding / np.linalg.norm(truth_embedding)
 
-    # Calculate cosine similarity using numpy's dot product
     similarity = np.dot(response_embedding, truth_embedding)
-
-    # Return similarity as a percentage (between 0 and 100)
-    similarity_percentage = (similarity + 1) / 2 * 100  # Normalize from [-1, 1] to [0, 100]
+    similarity_percentage = (similarity + 1) / 2 * 100
 
     return similarity_percentage
 
-
-# This is the missing query_app function that needs to be defined
 def query_app(query, model_name, use_similarity_check, openai_api_key):
     global vector_index, query_log
 
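
For reference, a minimal sketch of how the updated helpers could be exercised locally, assuming app.py from this Space is importable without launching the Gradio UI, Tesseract and Poppler are installed, and "sample.pdf" plus the ground-truth string are placeholders rather than files in this repo:

# Hypothetical smoke test for the new extraction path; not part of the commit.
from app import extract_text, calculate_similarity  # app.py from this Space

if __name__ == "__main__":
    # PDFs fall back to OpenCV-preprocessed OCR on pages with no extractable text;
    # PNG/JPG/JPEG inputs are routed straight to extract_text_from_image.
    text = extract_text("sample.pdf", lang="eng")  # "sample.pdf" is a placeholder path
    print(text[:500])

    # calculate_similarity rescales cosine similarity from [-1, 1] to a 0-100 scale.
    score = calculate_similarity(text, "expected ground-truth answer")
    print(f"Similarity: {score:.1f}%")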