import os

import fitz  # PyMuPDF (imported as fitz)
import gradio as gr
from huggingface_hub import InferenceClient


def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at pdf_path."""
    try:
        text = ""
        with fitz.open(pdf_path) as doc:
            for page in doc:
                text += page.get_text()
        return text
    except Exception as e:
        return f"Error extracting text from PDF: {str(e)}"


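# Quick standalone check of the extraction step, independent of the Gradio UI.
# "sample_invoice.pdf" is a placeholder path, not a file that ships with this app:
#
#     print(extract_text_from_pdf("sample_invoice.pdf")[:500])

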
def extract_invoice_details_from_text(api_key, files, model_name, prompt):
    """Send the text of each uploaded PDF to the selected chat model and collect the results."""
    try:
        if not api_key.strip():
            return "Error: Please provide a valid Hugging Face API key."
        if not files:
            return "Error: Please upload at least one PDF file."

        client = InferenceClient(api_key=api_key)
        all_extracted_data = []

        default_prompt = (
            "Please extract the details listed below from the attached document and present them in an Excel-style table.\n"
            "Information to extract: DATE, NAME & ADDRESS OF BUYER, Item Code, HSN CODE, UOM, Qty, Unit Price\n\nInvoice text:\n"
        )
        user_prompt = prompt if prompt.strip() else default_prompt

        for file in files:
            # gr.File may yield plain path strings or tempfile-like objects,
            # depending on the Gradio version; handle both.
            pdf_path = file if isinstance(file, str) else file.name
            pdf_text = extract_text_from_pdf(pdf_path)
            if not pdf_text.startswith("Error"):
                response = client.chat.completions.create(
                    model=model_name,
                    messages=[{"role": "user", "content": user_prompt + pdf_text}],
                    max_tokens=2000,
                )
                extracted_data = response.choices[0].message.content
                all_extracted_data.append(f"File: {pdf_path}\n{extracted_data.strip()}")
            else:
                all_extracted_data.append(f"File: {pdf_path}\n{pdf_text}")

        return "\n\n".join(all_extracted_data)
    except Exception as e:
        return f"Error occurred while processing: {str(e)}"


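# The function above can also be exercised without the UI. The token and file
# names below are placeholders, not values defined elsewhere in this app:
#
#     result = extract_invoice_details_from_text(
#         api_key="hf_xxx",          # your Hugging Face token
#         files=["invoice1.pdf"],    # local PDF paths
#         model_name="Qwen/Qwen2.5-Coder-32B-Instruct",
#         prompt="",                 # empty -> falls back to the default prompt
#     )
#     print(result)

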
def gradio_interface():
    with gr.Blocks() as demo:
        gr.Markdown("# PDF Data Extraction")
        gr.Markdown(
            "Upload your PDF files, select a model, and provide a prompt to extract data."
        )

        with gr.Row():
            api_key = gr.Textbox(
                label="Hugging Face API Key",
                placeholder="Enter your Hugging Face API key",
                type="password",
            )

        files = gr.File(label="Upload PDF Files", file_types=[".pdf"], file_count="multiple")
        prompt = gr.Textbox(
            label="Custom Prompt (optional)",
            placeholder="Enter custom prompt here (optional)",
            lines=4,
        )
        model_name = gr.Dropdown(
            label="Select Model",
            choices=[
                "Qwen/Qwen2.5-Coder-32B-Instruct",
                "Qwen/Qwen2.5-72B-Instruct",
                "meta-llama/Llama-3.2-1B-Instruct",
                "mistralai/Mistral-7B-Instruct-v0.3",
                "meta-llama/Meta-Llama-3-8B-Instruct",
                "microsoft/Phi-3.5-mini-instruct",
                "mistralai/Mixtral-8x7B-Instruct-v0.1",
                "microsoft/Phi-3-mini-4k-instruct",
            ],
            value="Qwen/Qwen2.5-Coder-32B-Instruct",
        )

        output = gr.Textbox(label="Extracted Data", lines=10)
        submit_button = gr.Button("Extract Data")

        submit_button.click(
            extract_invoice_details_from_text,
            inputs=[api_key, files, model_name, prompt],
            outputs=[output],
        )

    return demo


if __name__ == "__main__":
    app = gradio_interface()
    app.launch()
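# launch() also accepts standard Gradio options if needed, e.g. a temporary
# public link via app.launch(share=True) or a fixed port via
# app.launch(server_port=7860).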