Spaces:

ravithejads
/

docsques

Runtime error

App Files Files Community

docsques / app.py

ravithejads

Create app.py

6844495 almost 2 years ago

raw

history blame contribute delete

6.88 kB

	from gpt_index import Document, GPTListIndex
	import gradio as gr
	import openai
	import os
	import PyPDF2
	import docx
	import pytesseract
	from PIL import Image

	def pdftotext(file_name):
	    """
	    Function to pull the text out of a .pdf file.

	    The file is opened in binary mode and handed to PyPDF2; the text of
	    each page is extracted in page order and the pages are joined with a
	    newline between them.
	    """
	    # Extraction has to happen while the underlying file is still open.
	    with open(file_name, 'rb') as handle:
	        reader = PyPDF2.PdfReader(handle)
	        page_texts = [page.extract_text() for page in reader.pages]

	    return "\n".join(page_texts)

	def docxtotext(file_name):
	    """
	    Function to pull the text out of a .docx file.

	    Returns the text of every paragraph in the document, in order,
	    separated by newlines.
	    """
	    paragraphs = docx.Document(file_name).paragraphs
	    return '\n'.join(para.text for para in paragraphs)

	def readtextfile(file_name):
	    """
	    Function to read .txt format files.

	    The file is decoded as UTF-8 regardless of the platform's default
	    encoding, so the same upload reads the same way on every host.
	    Bytes that are not valid UTF-8 are replaced with U+FFFD instead of
	    raising, so a single stray byte does not abort the whole upload.

	    Returns the full file content as one string.
	    """
	    with open(file_name, 'r', encoding="utf-8", errors="replace") as file:
	        return file.read()

	def imagetotext(file_name):
	    """
	    Function to extract text from images.

	    Opens the image with PIL and runs it through pytesseract's
	    image_to_string. The image is opened in a context manager so the
	    underlying file handle is released as soon as OCR finishes, rather
	    than staying open until garbage collection.

	    Returns the value produced by pytesseract.image_to_string.
	    """
	    with Image.open(file_name) as image:
	        return pytesseract.image_to_string(image)

	def preprocesstext(text):
	    """
	    Function to tidy extracted text.

	    Drops every line that is empty or contains only whitespace and
	    re-joins the surviving lines with a single newline. Lines that are
	    kept are not trimmed.
	    """
	    return '\n'.join(row for row in text.splitlines() if row.strip())

	def processfiles(files):
	    """
	    Function to turn uploaded files into a list of cleaned-up texts.

	    Each item in `files` must expose a `.name` attribute holding its
	    path. The extractor is picked from the lower-cased text after the
	    last "." in that path; unrecognised extensions yield an empty
	    string. Every result is passed through preprocesstext, and the
	    returned list has one entry per input file, in input order.
	    """

	    def extract(path):
	        # Choose the reader from the file extension.
	        suffix = path.split(".")[-1].lower()
	        if suffix == "pdf":
	            return pdftotext(path)
	        if suffix == "docx":
	            return docxtotext(path)
	        if suffix == "txt":
	            return readtextfile(path)
	        if suffix in ("png", "jpg", "jpeg"):
	            return imagetotext(path)
	        return ""

	    return [preprocesstext(extract(uploaded.name)) for uploaded in files]

	def createdocuments(textlist):
	    """
	    Function to wrap each text in a Document, as needed for indexing.

	    Returns a list with one Document per entry of `textlist`, in the
	    same order.
	    """
	    return [Document(text) for text in textlist]

	def fileformatvaliditycheck(files):
	    """
	    Function to check that every uploaded file has a supported format.

	    A file is accepted when the lower-cased text after the last "." in
	    its `.name` is one of pdf, txt, docx, png, jpg or jpeg. Returns
	    True only if all files are accepted (and therefore True for an
	    empty collection), otherwise False.
	    """
	    supported = ("pdf", "txt", "docx", "png", "jpg", "jpeg")
	    return all(
	        upload.name.split(".")[-1].lower() in supported
	        for upload in files
	    )

	def openaiapikeyvaliditycheck(openaikey):
	    """
	    Function to check whether an OpenAI key is accepted by the API.

	    Sets the key on the openai module, then probes it by listing the
	    available models. Returns the exact string "Valid OpenAI API key"
	    when the probe succeeds; if the call raises openai.OpenAIError, it
	    returns an error message pointing to the key-management page.
	    """
	    openai.api_key = openaikey

	    try:
	        # Only success or failure matters; the listing itself is unused.
	        openai.Model.list()
	    except openai.OpenAIError:
	        apikeylink = "https://beta.openai.com/account/api-keys"
	        return f"Incorrect OpenAI API key provided: {openaikey}. You can find your OpenAI API key here - {apikeylink}"
	    else:
	        return "Valid OpenAI API key"

	def createindex(files, openaikey):
	    """
	    Function to validate the upload and build the on-disk index.

	    The checks run in a fixed order and the first failure is returned
	    as a message: no files, unsupported file format, missing key, key
	    rejected by OpenAI. On success the key is exported through the
	    OPENAI_API_KEY environment variable, the files are converted to
	    Documents, a GPTListIndex is built from them and saved to
	    'index.json', and a success message is returned.
	    """
	    # Guard clauses: bail out with a message at the first failed check.
	    if not files:
	        return "Upload file before proceeding further."

	    if not fileformatvaliditycheck(files):
	        return "Please upload documents in pdf/txt/docx/png/jpg/jpeg format only."

	    if not openaikey:
	        return "Please enter your openai key."

	    key_status = openaiapikeyvaliditycheck(openaikey)
	    if key_status != "Valid OpenAI API key":
	        return key_status

	    # Export the key through the environment for the indexing step.
	    os.environ['OPENAI_API_KEY'] = openaikey

	    # Extract text from the uploads and wrap it for indexing.
	    documents = createdocuments(processfiles(files))

	    # Build the index and persist it for later queries.
	    GPTListIndex(documents, chunk_size_limit=3500).save_to_disk('index.json')

	    return "Uploading documents successfully. OpenAI API Key provided is Valid."

	def docques(query, openaikey):
	    """
	    Function for querying the index created by createindex.

	    Before touching the index, the inputs are validated so the user
	    gets a readable message instead of an unhandled exception:
	      - a missing key returns "Please enter your openai key."
	      - a blank query returns "Please enter your question."
	      - a missing 'index.json' (nothing uploaded yet) returns
	        "Please upload your documents before asking a question."

	    Otherwise the key is exported through the OPENAI_API_KEY
	    environment variable, the index is loaded from 'index.json' and
	    queried with response_mode="tree_summarize". The value returned by
	    index.query is passed back unchanged.
	    """
	    if not openaikey:
	        return "Please enter your openai key."

	    if not query or not query.strip():
	        return "Please enter your question."

	    # The index only exists after a successful upload.
	    if not os.path.exists('index.json'):
	        return "Please upload your documents before asking a question."

	    # Store openai key in environment
	    os.environ['OPENAI_API_KEY'] = openaikey

	    # Load index
	    index = GPTListIndex.load_from_disk('index.json')

	    # Query based on index
	    return index.query(query, response_mode="tree_summarize")

	def cleartext(query, output):
	    """
	    Function to reset the query and answer boxes.

	    Both arguments are ignored; the result is always a list of two
	    empty strings, one for each box.
	    """
	    blank = ""
	    return [blank, blank]

	# Build the Gradio UI: intro text, then a two-column row with the inputs
	# on the left and the outputs on the right, then wire the three buttons
	# to their handlers. The nesting below is reconstructed from the order of
	# the statements.
	with gr.Blocks() as demo:
	    gr.Markdown(
	        """
	<h1><center><b>DocQues</b></center></h1>

	""")
	    gr.Markdown(
	        """
	This app answers your queries on longer and multiple documents (pdf/docx/txt/png/jpeg/jpg) you upload. It uses <a href = "https://github.com/jerryjliu/gpt_index">GPT-Index</a> and OpenAI GPT3 in the backend, get your
	<a href = "https://beta.openai.com/account/api-keys">Openai key here</a> before proceeding further.\n
	""")
	    gr.Markdown(
	        """
	<br>Use this space effectively by following below 2 step process.</br>
	Step-1
	<br>- Upload pdf/docx/txt/png/jpeg/jpg format documents.
	<br>- Enter your openai key.
	<br>- Click upload and wait to see if upload is successful or not. </br>
	Step-2
	<br>- Enter your query.
	<br>- Click submit.
	<br>- Check Answer </br>
	Please refer to the GitHub repo this Space is based on, here - <a href = "https://github.com/ravi03071991/DocQues">DocQues</a> .
	"""
	    )
	    with gr.Row():
	        # Left column: everything the user provides.
	        with gr.Column():
	            # The label lists every format the validity check accepts.
	            files = gr.File(label="Upload pdf/docx/txt/png/jpg/jpeg format documents.", file_count="multiple")
	            openaikey = gr.Textbox(lines=1, label="Enter your OpenAI Key.")
	            upload_button = gr.Button("Upload")
	            query = gr.Textbox(lines=2, label="Enter Your Question.")
	            submit_button = gr.Button("Submit")
	        # Right column: status and answer display.
	        with gr.Column():
	            upload_output = gr.Textbox(label="Upload/ Error.")
	            ans_output = gr.Textbox(label="Answer.")
	            clear_button = gr.Button("Clear")

	    # Upload button for uploading files and openai key.
	    upload_button.click(createindex, inputs=[files, openaikey], outputs=[upload_output])

	    # Submit button for submitting query.
	    submit_button.click(docques, inputs=[query, openaikey], outputs=[ans_output])

	    # Clear button for clearing query and answer.
	    clear_button.click(cleartext, inputs=[query, ans_output], outputs=[query, ans_output])

	demo.launch()