"""Gradio app that answers questions about an uploaded invoice.

The uploaded file is converted to text, indexed in an in-memory BM25
document store, and each question is answered by a Haystack
retriever -> PromptNode pipeline.
"""

import io
import os
from getpass import getpass
from pprint import pprint

import gradio as gr
import PyPDF2
from haystack import Pipeline
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import BM25Retriever
from haystack.nodes import PromptTemplate, PromptNode
from haystack.schema import Document
from huggingface_hub import notebook_login

# Token entered at start-up; used as a fallback when the UI token field is
# left empty.
HF_TOKEN = getpass("Enter Token")

notebook_login()

# The fallback instruction ("If the answer cannot be deduced ...") is kept
# together with its quoted reply so the model receives a coherent rule.
QA_PROMPT = """
Using exclusively the information contained in the context, answer only the
question asked without adding suggestions for possible questions, and respond
exclusively in English.
Don't add anything from the references if it is not asked explicitly.
Do not repeat the same information twice.
If the answer cannot be deduced from the context, respond:
"Not sure because not relevant to the context."
Context: {join(documents)};
Question: {query}
"""


def _read_upload_bytes(file):
    """Return the raw bytes of an upload given as a path or file-like object."""
    if file is None:
        raise ValueError("No file was uploaded.")
    if isinstance(file, (str, os.PathLike)):
        with open(file, "rb") as handle:
            return handle.read()
    # Prefer re-opening the backing path: the handle we were given may be
    # positioned at EOF or opened in text mode.
    path = getattr(file, "name", None)
    if isinstance(path, str) and os.path.isfile(path):
        with open(path, "rb") as handle:
            return handle.read()
    data = file.read()
    return data.encode("utf-8") if isinstance(data, str) else data


def _extract_text(file):
    """Return the textual content of the uploaded file.

    PDFs (detected by their ``%PDF-`` header) are parsed with PyPDF2; any
    other upload is decoded as UTF-8 text on a best-effort basis.
    """
    raw = _read_upload_bytes(file)
    if raw[:1024].lstrip().startswith(b"%PDF-"):
        reader = PyPDF2.PdfReader(io.BytesIO(raw))
        # extract_text() may return None/"" for pages without a text layer.
        return "\n".join(page.extract_text() or "" for page in reader.pages)
    # NOTE(review): image uploads would need OCR, which this file does not
    # import; they end up here and will not yield meaningful text.
    return raw.decode("utf-8", errors="replace")


def _split_questions(questions):
    """Split a comma-separated string into stripped, non-empty questions."""
    return [part.strip() for part in (questions or "").split(",") if part.strip()]


# Process and retrieve answers
def process_invoice(file, hf_token, questions):
    """Answer each question using only the content of the uploaded file.

    Args:
        file: Uploaded invoice, as a filesystem path or an object exposing
            ``.name`` and/or ``.read()``.
        hf_token: Hugging Face API token; when empty, the module-level
            ``HF_TOKEN`` is used instead.
        questions: Comma-separated questions.

    Returns:
        A dict mapping each stripped question to the model's answer (an empty
        string when the model returned no result).

    Raises:
        ValueError: If no file was given or no text could be extracted.
    """
    question_list = _split_questions(questions)
    if not question_list:
        return {}

    file_content = _extract_text(file)
    if not file_content.strip():
        raise ValueError("No text could be extracted from the uploaded file.")

    document_store = InMemoryDocumentStore(use_bm25=True)
    document_store.write_documents([Document(content=file_content)])

    retriever = BM25Retriever(document_store, top_k=2)
    qa_template = PromptTemplate(prompt=QA_PROMPT)
    prompt_node = PromptNode(
        model_name_or_path='mistralai/Mixtral-8x7B-Instruct-v0.1',
        api_key=hf_token or HF_TOKEN,
        default_prompt_template=qa_template,
        max_length=500,
        model_kwargs={"model_max_length": 5000},
    )

    rag_pipeline = Pipeline()
    rag_pipeline.add_node(component=retriever, name="retriever", inputs=["Query"])
    rag_pipeline.add_node(component=prompt_node, name="prompt_node", inputs=["retriever"])

    answers = {}
    for question in question_list:
        result = rag_pipeline.run(query=question)
        results = result.get("results") or []
        # Key by the same stripped text that was sent as the query.
        answers[question] = results[0].strip() if results else ""
    return answers


# Gradio interface
def gradio_interface(file, hf_token, questions):
    """Gradio callback: forward the UI inputs to ``process_invoice``."""
    return process_invoice(file, hf_token, questions)


# NOTE(review): `gr.inputs.*` is Gradio's legacy component namespace; confirm
# the installed Gradio version still provides it before upgrading.
interface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.inputs.File(file_count="single", type="file", label="Upload Invoice (PDF or Image)"),
        gr.inputs.Textbox(type="password", label="Enter your Hugging Face Token"),
        gr.inputs.Textbox(lines=5, placeholder="Enter your questions separated by commas"),
    ],
    outputs="json",
    title="Invoice Data Extraction",
    description="Upload an invoice PDF or image, provide your Hugging Face token, and get the extracted data based on your questions.",
)

if __name__ == "__main__":
    interface.launch()