capradeepgujaran committed on
Commit
afd4764
1 Parent(s): 4edc165

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -21
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import os
2
  import tempfile
 
3
  import gradio as gr
4
  import PyPDF2
5
  from pdf2image import convert_from_path
@@ -10,7 +11,6 @@ from llama_index.llms.openai import OpenAI
10
  from llama_index.core import get_response_synthesizer
11
  from dotenv import load_dotenv
12
  from sentence_transformers import SentenceTransformer, util
13
- import logging
14
 
15
  # Set up logging configuration
16
  logging.basicConfig(level=logging.INFO, format='%(asctime)s | %(levelname)s | %(message)s')
@@ -25,26 +25,38 @@ sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
25
 
26
  def extract_text_from_pdf(pdf_path):
27
  text = ""
28
- with open(pdf_path, 'rb') as file:
29
- pdf_reader = PyPDF2.PdfReader(file)
30
- for page in pdf_reader.pages:
31
- page_text = page.extract_text()
32
- if page_text.strip():
33
- text += page_text
34
- else:
35
- # If text extraction fails, convert the page to an image
36
- images = convert_from_path(pdf_path, first_page=pdf_reader.pages.index(page) + 1, last_page=pdf_reader.pages.index(page) + 1)
37
- if images:
38
- text += f"[Image on page {pdf_reader.pages.index(page) + 1}]\n"
 
 
 
 
39
  return text
40
 
41
  def load_docx_file(docx_path):
42
- doc = docx.Document(docx_path)
43
- return '\n'.join([para.text for para in doc.paragraphs])
 
 
 
 
44
 
45
  def load_txt_file(txt_path):
46
- with open(txt_path, 'r', encoding='utf-8') as f:
47
- return f.read()
 
 
 
 
48
 
49
  def load_file_based_on_extension(file_path):
50
  if file_path.lower().endswith('.pdf'):
@@ -66,19 +78,28 @@ def process_upload(api_key, files):
66
  return "No files uploaded.", None
67
 
68
  documents = []
 
69
  for file_path in files:
70
  try:
71
  text = load_file_based_on_extension(file_path)
72
  documents.append(Document(text=text))
73
  except Exception as e:
74
- return f"Error processing file {file_path}: {str(e)}", None
 
 
75
 
76
  if documents:
77
- embed_model = OpenAIEmbedding(model="text-embedding-3-large", api_key=api_key)
78
- vector_index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)
79
- return f"Successfully indexed {len(documents)} files.", vector_index
 
 
 
 
 
 
80
  else:
81
- return "No valid documents were indexed.", None
82
 
83
  def calculate_similarity(response, ground_truth):
84
  response_embedding = sentence_model.encode(response, convert_to_tensor=True)
 
1
  import os
2
  import tempfile
3
+ import logging
4
  import gradio as gr
5
  import PyPDF2
6
  from pdf2image import convert_from_path
 
11
  from llama_index.core import get_response_synthesizer
12
  from dotenv import load_dotenv
13
  from sentence_transformers import SentenceTransformer, util
 
14
 
15
  # Set up logging configuration
16
  logging.basicConfig(level=logging.INFO, format='%(asctime)s | %(levelname)s | %(message)s')
 
25
 
26
  def extract_text_from_pdf(pdf_path):
27
  text = ""
28
+ try:
29
+ with open(pdf_path, 'rb') as file:
30
+ pdf_reader = PyPDF2.PdfReader(file)
31
+ for page_num, page in enumerate(pdf_reader.pages, 1):
32
+ page_text = page.extract_text()
33
+ if page_text.strip():
34
+ text += page_text
35
+ else:
36
+ # If text extraction fails, convert the page to an image
37
+ images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
38
+ if images:
39
+ text += f"[Image on page {page_num}]\n"
40
+ except Exception as e:
41
+ logging.error(f"Error processing PDF {pdf_path}: {str(e)}")
42
+ text += f"[Error processing PDF: {str(e)}]\n"
43
  return text
44
 
45
  def load_docx_file(docx_path):
46
+ try:
47
+ doc = docx.Document(docx_path)
48
+ return '\n'.join([para.text for para in doc.paragraphs])
49
+ except Exception as e:
50
+ logging.error(f"Error processing DOCX {docx_path}: {str(e)}")
51
+ return f"[Error processing DOCX: {str(e)}]\n"
52
 
53
  def load_txt_file(txt_path):
54
+ try:
55
+ with open(txt_path, 'r', encoding='utf-8') as f:
56
+ return f.read()
57
+ except Exception as e:
58
+ logging.error(f"Error processing TXT {txt_path}: {str(e)}")
59
+ return f"[Error processing TXT: {str(e)}]\n"
60
 
61
  def load_file_based_on_extension(file_path):
62
  if file_path.lower().endswith('.pdf'):
 
78
  return "No files uploaded.", None
79
 
80
  documents = []
81
+ error_messages = []
82
  for file_path in files:
83
  try:
84
  text = load_file_based_on_extension(file_path)
85
  documents.append(Document(text=text))
86
  except Exception as e:
87
+ error_message = f"Error processing file {file_path}: {str(e)}"
88
+ logging.error(error_message)
89
+ error_messages.append(error_message)
90
 
91
  if documents:
92
+ try:
93
+ embed_model = OpenAIEmbedding(model="text-embedding-3-large", api_key=api_key)
94
+ vector_index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)
95
+ success_message = f"Successfully indexed {len(documents)} files."
96
+ if error_messages:
97
+ return f"{success_message}\nErrors: {'; '.join(error_messages)}", vector_index
98
+ return success_message, vector_index
99
+ except Exception as e:
100
+ return f"Error creating index: {str(e)}", None
101
  else:
102
+ return f"No valid documents were indexed. Errors: {'; '.join(error_messages)}", None
103
 
104
  def calculate_similarity(response, ground_truth):
105
  response_embedding = sentence_model.encode(response, convert_to_tensor=True)