import PyPDF2
from pprint import pprint
from haystack import Pipeline
from haystack.schema import Document
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import BM25Retriever, PromptTemplate, PromptNode
import gradio as gr

# The Hugging Face token is supplied through the Gradio form below, so no
# separate getpass/notebook login is needed here.

# Process and retrieve answers
def process_invoice(file, hf_token, questions):
    # Extract the text from the uploaded PDF; raw file bytes are not usable
    # as a retrievable document
    reader = PyPDF2.PdfReader(file)
    file_content = "\n".join(page.extract_text() or "" for page in reader.pages)
    document = Document(content=file_content)
    docs = [document]

    document_store = InMemoryDocumentStore(use_bm25=True)
    document_store.write_documents(docs)
    retriever = BM25Retriever(document_store, top_k=2)
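    # BM25 is a sparse keyword retriever; top_k=2 forwards at most the two
    # best-matching documents to the prompt. With a single invoice in the
    # store, this effectively passes the whole document along.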

    qa_template = PromptTemplate(prompt=
        """Using exclusively the information contained in the context, answer only the
        question asked, and respond exclusively in English. Do not add suggestions for
        possible follow-up questions, do not add anything from the references unless it
        is asked for explicitly, and do not repeat the same information twice. If the
        answer cannot be deduced from the context, respond: "Not sure because not
        relevant to the context."
        Context: {join(documents)};
        Question: {query}
        """)

    prompt_node = PromptNode(
        model_name_or_path='mistralai/Mixtral-8x7B-Instruct-v0.1',
        api_key=hf_token,
        default_prompt_template=qa_template,
        max_length=500,
        model_kwargs={"model_max_length": 5000}
    )
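    # The PromptNode sends the filled-in template to Mixtral-8x7B-Instruct via
    # the Hugging Face Inference API, authenticated with the user's token.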

    rag_pipeline = Pipeline()
    rag_pipeline.add_node(component=retriever, name="retriever", inputs=["Query"])
    rag_pipeline.add_node(component=prompt_node, name="prompt_node", inputs=["retriever"])
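    # Query flow: each question goes to the retriever first; the retrieved
    # documents plus the query then fill the prompt template for the LLM.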

    answers = {}
    for question in questions.split(','):
        question = question.strip()
        result = rag_pipeline.run(query=question)
        answers[question] = result["results"][0].strip()

    return answers
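# Example usage outside Gradio (hypothetical file path and token; assumes a
# text-based PDF that PyPDF2 can extract):
#   answers = process_invoice("sample_invoice.pdf", "hf_xxx",
#                             "What is the invoice number?, What is the total amount due?")
#   pprint(answers)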

# Gradio interface
def gradio_interface(file, hf_token, questions):
    answers = process_invoice(file, hf_token, questions)
    return answers

interface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.File(file_count="single", type="filepath", label="Upload Invoice (PDF)"),
        gr.Textbox(type="password", label="Enter your Hugging Face Token"),
        gr.Textbox(lines=5, placeholder="Enter your questions separated by commas")
    ],
    outputs="json",
    title="Invoice Data Extraction",
    description="Upload an invoice PDF, provide your Hugging Face token, and get the extracted data based on your questions."
)

if __name__ == "__main__":
    interface.launch()