indemo / app.py
muradkhan's picture
Update app.py
86d3138 verified
raw
history blame
1.58 kB
from pprint import pprint
from getpass import getpass
from PyPDF2 import PdfReader
import gradio as gr
import os
from transformers import pipeline
# Function to read PDF file content directly
def read_pdf(pdf_path):
    """Return the extracted text of every page of a PDF, concatenated.

    Args:
        pdf_path: Value handed unchanged to ``PdfReader`` to open the document.

    Returns:
        One string holding the text of all pages in page order; the empty
        string when no page yields any text.
    """
    reader = PdfReader(pdf_path)
    # `or ""` tolerates an extractor result of None, so a single page without
    # text cannot abort the whole read; join avoids repeated concatenation.
    return "".join(page.extract_text() or "" for page in reader.pages)
# Process and retrieve answers
def process_invoice(file, hf_token, questions):
    """Answer comma-separated questions about an uploaded invoice PDF.

    Args:
        file: The uploaded PDF, either an object whose ``name`` attribute is
            the path on disk or the path itself.
        hf_token: Hugging Face access token forwarded to the pipeline.
        questions: Questions separated by commas. Whitespace around each
            question is ignored and blank entries are skipped.

    Returns:
        A dict mapping each (stripped) question to the ``answer`` field of the
        pipeline result. Empty when no non-blank question was supplied.

    Raises:
        ValueError: If questions were supplied but no file was uploaded.
    """
    # Normalize once so the pipeline input and the result key are the same text.
    asked = [part.strip() for part in (questions or "").split(",")]
    asked = [question for question in asked if question]
    if not asked:
        # Nothing to answer: skip PDF parsing and the expensive model load.
        return {}

    if file is None:
        raise ValueError("No PDF file was provided.")

    # Accept both an upload wrapper exposing `.name` and a plain path.
    pdf_content = read_pdf(getattr(file, "name", file))

    # NOTE(review): this checkpoint is named as an instruction-tuned model;
    # confirm it actually provides a question-answering head, or switch to an
    # extractive QA checkpoint.
    qa_pipeline = pipeline(
        "question-answering",
        model="mistralai/Mixtral-8x7B-Instruct-v0.1",
        token=hf_token,
    )

    return {
        question: qa_pipeline(question=question, context=pdf_content)["answer"]
        for question in asked
    }
# Gradio interface
def gradio_interface(file, hf_token, questions):
    """Gradio callback: pass the inputs to ``process_invoice`` and return its result."""
    return process_invoice(file, hf_token, questions)
# Components are taken from the top-level `gr` namespace rather than the
# deprecated `gr.inputs` one, which newer Gradio releases no longer ship.
# The File component's `type` is left at the library default because the
# legacy value "file" is not accepted by newer releases.
interface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.File(file_count="single", label="Upload Invoice (PDF)"),
        gr.Textbox(type="password", label="Enter your Hugging Face Token"),
        gr.Textbox(lines=5, placeholder="Enter your questions separated by commas"),
    ],
    outputs="json",
    title="Invoice Data Extraction",
    description="Upload an invoice PDF, provide your Hugging Face token, and get the extracted data based on your questions.",
)

# Start the web UI only when run as a script, not when imported.
if __name__ == "__main__":
    interface.launch()