Spaces:

Manojajj
/

Document-QA-without-RAG-Langchain

Running

App Files Files Community

Document-QA-without-RAG-Langchain / app.py

Manojajj

Update app.py

bc4901f verified 3 months ago

raw

history blame contribute delete

3.74 kB

	import gradio as gr
	import fitz # PyMuPDF for PDF extraction
	from huggingface_hub import InferenceClient
	import os

	# Function to extract text from PDF
	def extract_text_from_pdf(pdf_path):
	    """Return the concatenated text of every page in a PDF.

	    Args:
	        pdf_path: Filesystem path of the PDF to read.

	    Returns:
	        The text of all pages joined in page order. On any failure a
	        string starting with "Error extracting text from PDF: " is
	        returned instead of raising, because the caller detects failure
	        by inspecting the returned text.
	    """
	    try:
	        # The context manager closes the document (and its file handle)
	        # even when reading a page fails part-way through.
	        with fitz.open(pdf_path) as doc:
	            return "".join(page.get_text() for page in doc)
	    except Exception as e:
	        return f"Error extracting text from PDF: {str(e)}"

	# Function to send extracted text to the model and get the details
	def extract_invoice_details_from_text(api_key, files, model_name, prompt):
	    """Extract details from uploaded PDFs by sending their text to a chat model.

	    Args:
	        api_key: Hugging Face API key used to create the InferenceClient.
	        files: Uploaded files; each item is either a path string or an
	            object whose ``name`` attribute holds the path.
	        model_name: Model identifier passed to the chat-completion call.
	        prompt: Optional custom instruction; a built-in default is used
	            when it is empty or missing.

	    Returns:
	        One section per file (a "File: <path>" line followed by the model
	        output or the extraction error), sections separated by a blank
	        line. A single "Error..." message is returned when the inputs are
	        invalid or processing fails.
	    """
	    # Must stay in sync with the message extract_text_from_pdf returns on
	    # failure. Matching the full prefix (not just "Error") avoids treating
	    # a PDF whose own text starts with "Error" as a failed extraction.
	    extraction_error_prefix = "Error extracting text from PDF:"

	    try:
	        # Validate inputs before creating a client or touching any file
	        if not api_key or not api_key.strip():
	            return "Error: Please provide a valid Hugging Face API key."
	        if not files:
	            # Covers both None (nothing uploaded) and an empty list
	            return "Error: Please upload at least one PDF file."

	        # Initialize the InferenceClient
	        client = InferenceClient(api_key=api_key)

	        # Default prompt if none is provided
	        default_prompt = (
	            "Can you please parse below details from attached documents in excel format?\n"
	            "Information to extract: DATE, NAME & ADDRESS OF BUYER, Item Code, HSN CODE, UOM, Qty, Unit Price\n\nInvoice text:\n"
	        )
	        user_prompt = prompt if prompt and prompt.strip() else default_prompt

	        # Process each uploaded file
	        all_extracted_data = []
	        for file in files:
	            # Accept both file wrapper objects and plain path strings
	            pdf_path = getattr(file, "name", file)
	            pdf_text = extract_text_from_pdf(pdf_path)

	            if pdf_text.startswith(extraction_error_prefix):
	                # Report the extraction failure for this file and move on
	                all_extracted_data.append(f"File: {pdf_path}\n{pdf_text}")
	                continue

	            response = client.chat.completions.create(
	                model=model_name,
	                messages=[{"role": "user", "content": user_prompt + pdf_text}],
	                max_tokens=2000,
	            )
	            extracted_data = response['choices'][0]['message']['content']
	            all_extracted_data.append(f"File: {pdf_path}\n{extracted_data.strip()}")

	        return "\n\n".join(all_extracted_data)
	    except Exception as e:
	        return f"Error occurred while processing: {str(e)}"

	# Define the Gradio interface
	def gradio_interface():
	    """Assemble the Gradio Blocks UI for PDF data extraction and return it.

	    The layout contains, in order: a title and short instructions, an API
	    key field inside a row, a multi-file PDF upload, an optional prompt
	    box, a model dropdown, an output box and a submit button. Clicking the
	    button calls extract_invoice_details_from_text with the API key, the
	    files, the selected model and the prompt, and writes the result into
	    the output box.
	    """
	    # Models offered in the dropdown; the first entry is the default
	    available_models = (
	        "Qwen/Qwen2.5-Coder-32B-Instruct",
	        "Qwen/Qwen2.5-72B-Instruct",
	        "meta-llama/Llama-3.2-1B-Instruct",
	        "mistralai/Mistral-7B-Instruct-v0.3",
	        "meta-llama/Meta-Llama-3-8B-Instruct",
	        "microsoft/Phi-3.5-mini-instruct",
	        "mistralai/Mixtral-8x7B-Instruct-v0.1",
	        "microsoft/Phi-3-mini-4k-instruct",
	    )

	    with gr.Blocks() as ui:
	        # Page heading and instructions
	        gr.Markdown("# PDF Data Extraction")
	        gr.Markdown("Upload your PDF files, select a model, and provide a prompt to extract data.")

	        # Credentials live in their own row
	        with gr.Row():
	            key_box = gr.Textbox(label="Hugging Face API Key", placeholder="Enter your Hugging Face API key", type="password")

	        # Inputs: documents, optional instruction, model choice
	        pdf_upload = gr.File(label="Upload PDF Files", file_types=[".pdf"], file_count="multiple")
	        prompt_box = gr.Textbox(label="Custom Prompt (optional)", placeholder="Enter custom prompt here (optional)", lines=4)
	        model_picker = gr.Dropdown(label="Select Model", choices=list(available_models), value=available_models[0])

	        # Result area and trigger
	        result_box = gr.Textbox(label="Extracted Data", lines=10)
	        run_button = gr.Button("Extract Data")

	        # Wire the button to the extraction function
	        run_button.click(
	            extract_invoice_details_from_text,
	            inputs=[key_box, pdf_upload, model_picker, prompt_box],
	            outputs=[result_box],
	        )

	    return ui

	# Build the Blocks UI and start the Gradio server.
	# NOTE(review): launch() runs unconditionally at module level (there is no
	# __main__ guard), so importing this module also starts the app; confirm
	# that this is intended.
	app = gradio_interface()
	app.launch()