muradkhan committed on
Commit
45a0b43
1 Parent(s): 0c06edd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -39
app.py CHANGED
@@ -1,52 +1,27 @@
1
  import PyPDF2
2
  from pprint import pprint
 
3
  from haystack import Pipeline
4
  from haystack.schema import Document
5
  from haystack.nodes import BM25Retriever
6
  from haystack.document_stores import InMemoryDocumentStore
7
- from haystack.nodes import PreProcessor, PromptTemplate, PromptNode
8
- from pdf2image import convert_from_path
9
- import pytesseract
10
- from PIL import Image
11
  import gradio as gr
12
  import os
13
- from pydantic import BaseModel
14
 
15
- # Function to extract text from a PDF file using OCR
16
- def extract_text_from_pdf(pdf_path):
17
- text = ""
18
- # Convert PDF pages to images
19
- images = convert_from_path(pdf_path)
20
- for image in images:
21
- # Perform OCR on the image
22
- text += pytesseract.image_to_string(image)
23
- return text
24
-
25
- class Config(BaseModel):
26
- class Config:
27
- arbitrary_types_allowed = True
28
 
29
  # Process and retrieve answers
30
- def process_invoice(pdf, hf_token, questions):
31
- # Extract text from the PDF
32
- extracted_text = extract_text_from_pdf(pdf.name)
33
- document = Document(content=extracted_text)
34
  docs = [document]
35
 
36
- # Initializing the processor
37
- processor = PreProcessor(
38
- clean_empty_lines=True,
39
- clean_whitespace=True,
40
- clean_header_footer=True,
41
- split_by="word",
42
- split_length=500,
43
- split_respect_sentence_boundary=True,
44
- split_overlap=0,
45
- )
46
-
47
- preprocessed_docs = processor.process(docs)
48
  document_store = InMemoryDocumentStore(use_bm25=True)
49
- document_store.write_documents(preprocessed_docs)
50
  retriever = BM25Retriever(document_store, top_k=2)
51
 
52
  qa_template = PromptTemplate(prompt=
@@ -78,20 +53,20 @@ def process_invoice(pdf, hf_token, questions):
78
  return answers
79
 
80
  # Gradio interface
81
- def gradio_interface(pdf, hf_token, questions):
82
- answers = process_invoice(pdf, hf_token, questions)
83
  return answers
84
 
85
  interface = gr.Interface(
86
  fn=gradio_interface,
87
  inputs=[
88
- gr.inputs.File(file_count="single", type="file", label="Upload Invoice (PDF)"),
89
  gr.inputs.Textbox(type="password", label="Enter your Hugging Face Token"),
90
  gr.inputs.Textbox(lines=5, placeholder="Enter your questions separated by commas")
91
  ],
92
  outputs="json",
93
  title="Invoice Data Extraction",
94
- description="Upload an invoice PDF, provide your Hugging Face token, and get the extracted data based on your questions."
95
  )
96
 
97
  if __name__ == "__main__":
 
1
  import PyPDF2
2
  from pprint import pprint
3
+ from getpass import getpass
4
  from haystack import Pipeline
5
  from haystack.schema import Document
6
  from haystack.nodes import BM25Retriever
7
  from haystack.document_stores import InMemoryDocumentStore
8
+ from haystack.nodes import PromptTemplate, PromptNode
 
 
 
9
  import gradio as gr
10
  import os
 
11
 
12
+ HF_TOKEN = getpass("Enter Token")
13
+ from huggingface_hub import notebook_login
14
+ notebook_login()
 
 
 
 
 
 
 
 
 
 
15
 
16
  # Process and retrieve answers
17
+ def process_invoice(file, hf_token, questions):
18
+ # Read file content
19
+ file_content = file.read()
20
+ document = Document(content=file_content)
21
  docs = [document]
22
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  document_store = InMemoryDocumentStore(use_bm25=True)
24
+ document_store.write_documents(docs)
25
  retriever = BM25Retriever(document_store, top_k=2)
26
 
27
  qa_template = PromptTemplate(prompt=
 
53
  return answers
54
 
55
  # Gradio interface
56
+ def gradio_interface(file, hf_token, questions):
57
+ answers = process_invoice(file, hf_token, questions)
58
  return answers
59
 
60
  interface = gr.Interface(
61
  fn=gradio_interface,
62
  inputs=[
63
+ gr.inputs.File(file_count="single", type="file", label="Upload Invoice (PDF or Image)"),
64
  gr.inputs.Textbox(type="password", label="Enter your Hugging Face Token"),
65
  gr.inputs.Textbox(lines=5, placeholder="Enter your questions separated by commas")
66
  ],
67
  outputs="json",
68
  title="Invoice Data Extraction",
69
+ description="Upload an invoice PDF or image, provide your Hugging Face token, and get the extracted data based on your questions."
70
  )
71
 
72
  if __name__ == "__main__":