|
from pprint import pprint |
|
from getpass import getpass |
|
from PyPDF2 import PdfReader |
|
import gradio as gr |
|
import os |
|
from transformers import pipeline |
|
|
|
|
|
def read_pdf(pdf_path) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``PdfReader``.

    Returns:
        The extracted text of all pages joined with no separator. An empty
        string when the document has no pages or no extractable text.
    """
    reader = PdfReader(pdf_path)
    # Build the result with a single join instead of growing a string page by
    # page. ``or ""`` is defensive: a page that yields no text (None or "")
    # contributes nothing rather than raising a TypeError.
    return "".join(page.extract_text() or "" for page in reader.pages)
|
|
|
|
|
def process_invoice(file, hf_token, questions):
    """Answer comma-separated questions about an uploaded invoice PDF.

    Args:
        file: Uploaded-file object. Only its ``name`` attribute is read; it is
            handed to ``read_pdf`` as the PDF location.
        hf_token: Hugging Face access token forwarded to ``pipeline``.
        questions: Questions separated by commas. Whitespace around each
            question is ignored, blank entries are skipped and repeated
            questions are asked only once. ``None`` is treated as no
            questions.

    Returns:
        A dict mapping each stripped question to the ``'answer'`` field of the
        pipeline result, in the order the questions were first given. Empty
        when no non-blank question was supplied.
    """
    # Normalise once so the dict keys are exactly the strings sent to the
    # model; dict.fromkeys drops duplicates while preserving order.
    stripped = (part.strip() for part in (questions or "").split(","))
    prompts = list(dict.fromkeys(q for q in stripped if q))
    if not prompts:
        # Nothing to ask: skip the PDF parse and the costly model load.
        return {}

    pdf_content = read_pdf(file.name)

    # NOTE(review): this checkpoint appears to be an instruction-tuned
    # generative model; confirm it is suitable for the extractive
    # "question-answering" pipeline before relying on the answers.
    qa_pipeline = pipeline(
        "question-answering",
        model="mistralai/Mixtral-8x7B-Instruct-v0.1",
        token=hf_token,
    )

    return {
        prompt: qa_pipeline(question=prompt, context=pdf_content)["answer"]
        for prompt in prompts
    }
|
|
|
|
|
def gradio_interface(file, hf_token, questions):
    """Gradio callback: forward the form values to ``process_invoice``.

    Args:
        file: Uploaded invoice file as delivered by the file input.
        hf_token: Hugging Face token typed into the form.
        questions: Comma-separated questions typed into the form.

    Returns:
        Whatever ``process_invoice`` returns, unchanged.
    """
    return process_invoice(file, hf_token, questions)
|
|
|
# Web form: one PDF upload, a masked token field and a free-text box for the
# questions; the callback's dict result is rendered as JSON.
#
# The components are taken from the top-level ``gr`` namespace rather than the
# deprecated ``gr.inputs`` shim, with the same arguments as before.
# NOTE(review): type="file" is kept because the callback path reads
# ``file.name``; newer Gradio releases are understood to expect
# type="filepath" and pass a plain path string instead -- confirm against the
# installed version before upgrading.
interface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.File(
            file_count="single",
            type="file",
            label="Upload Invoice (PDF)",
        ),
        gr.Textbox(type="password", label="Enter your Hugging Face Token"),
        gr.Textbox(
            lines=5,
            placeholder="Enter your questions separated by commas",
        ),
    ],
    outputs="json",
    title="Invoice Data Extraction",
    description="Upload an invoice PDF, provide your Hugging Face token, and get the extracted data based on your questions.",
)


# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    interface.launch()
|
|