Update app.py
app.py
CHANGED
@@ -12,91 +12,95 @@ def extract_text_from_pdf(pdf_path):
         text += page.get_text()
         return text
     except Exception as e:
-
-        return None
 
 # Function to send extracted text to the model and get the details
-def extract_invoice_details_from_text(
     try:
-        #
-
 
-        #
-
-            model=model_name,  # Use the model selected by the user
-            messages=[{"role": "user", "content": prompt + text}],  # Append extracted PDF text to prompt
-            max_tokens=2000  # Adjust token size as needed
-        )
-        # Ensure the response format is correct and extract the content
-        return response['choices'][0]['message']['content']
-    except Exception as e:
-        print(f"Error occurred while processing the request: {str(e)}")
-        return "Error occurred while processing the request."
 
-#
-
-    if not api_key.strip():
-        return "Please provide a valid Hugging Face API key."
 
-
-
-
-
-
-
 
-
-
             pdf_text = extract_text_from_pdf(file.name)
-            if not pdf_text:
-
-
 
-
-
-
-        except Exception as e:
-            print(f"Error processing file {file.name}: {str(e)}")
-            all_extracted_data.append(f"Error processing {file.name}: {str(e)}")
 
-
 
-
-
-
-
 
-
-
-
             label="Select Model",
-            choices=[
-
         )
-        prompt_input = gr.Textbox(
-            label="Custom Prompt",
-            placeholder="Enter your custom prompt here (leave blank to use default prompt).",
-            lines=3
-        )
-        api_key_input = gr.Textbox(
-            label="Hugging Face API Key",
-            placeholder="Enter your Hugging Face API key here.",
-            type="password",
-            lines=1
-        )
-        extract_button = gr.Button("Extract Details from PDF")
-        output_box = gr.Textbox(
-            label="Extracted Data",
-            placeholder="The extracted details will appear here.",
-            lines=15,
-            interactive=False
-        )
 
-
-            process_files,
-            inputs=[file_input, prompt_input, model_dropdown, api_key_input],
-            outputs=output_box
-        )
 
 # Launch the app
 app.launch()
         text += page.get_text()
         return text
     except Exception as e:
+        return f"Error extracting text from PDF: {str(e)}"
 
 # Function to send extracted text to the model and get the details
+def extract_invoice_details_from_text(api_key, files, model_name, prompt):
     try:
+        # Validate API key
+        if not api_key.strip():
+            return "Error: Please provide a valid Hugging Face API key."
 
+        # Initialize the InferenceClient
+        client = InferenceClient(api_key=api_key)
 
+        # Prepare extracted data
+        all_extracted_data = []
 
+        # Default prompt if none is provided
+        default_prompt = (
+            "Can you please parse below details from attached documents in excel format?\n"
+            "Information to extract: DATE, NAME & ADDRESS OF BUYER, Item Code, HSN CODE, UOM, Qty, Unit Price\n\nInvoice text:\n"
+        )
+        user_prompt = prompt if prompt.strip() else default_prompt
 
+        # Process each uploaded file
+        for file in files:
             pdf_text = extract_text_from_pdf(file.name)
+            if not pdf_text.startswith("Error"):
+                response = client.chat.completions.create(
+                    model=model_name,
+                    messages=[{"role": "user", "content": user_prompt + pdf_text}],
+                    max_tokens=2000
+                )
+                extracted_data = response['choices'][0]['message']['content']
+                all_extracted_data.append(f"File: {file.name}\n{extracted_data.strip()}")
+            else:
+                all_extracted_data.append(f"File: {file.name}\n{pdf_text}")
 
+        return "\n\n".join(all_extracted_data)
+    except Exception as e:
+        return f"Error occurred while processing: {str(e)}"
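The handler above reads the model reply by indexing the response object; the same ChatCompletionOutput can also be read through the attribute access documented for huggingface_hub. A minimal standalone sketch of the call, not part of this commit, with a placeholder model id, token, and prompt:

from huggingface_hub import InferenceClient

client = InferenceClient(api_key="hf_xxx")  # placeholder token, not a real key

response = client.chat.completions.create(
    model="Qwen/Qwen2.5-Coder-32B-Instruct",
    messages=[{"role": "user", "content": "Extract DATE, Qty and Unit Price from this invoice text: ..."}],
    max_tokens=2000,
)

# Read the reply from the returned ChatCompletionOutput dataclass
print(response.choices[0].message.content)

The remaining added lines build the Gradio interface and wire it to this function: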
 
+# Define the Gradio interface
+def gradio_interface():
+    with gr.Blocks() as demo:
+        gr.Markdown("# PDF Data Extraction")
+        gr.Markdown(
+            "Upload your PDF files, select a model, and provide a prompt to extract data."
+        )
 
+        with gr.Row():
+            api_key = gr.Textbox(
+                label="Hugging Face API Key",
+                placeholder="Enter your Hugging Face API key",
+                type="password",
+            )
 
+        files = gr.File(label="Upload PDF Files", file_types=[".pdf"], file_count="multiple")
+        prompt = gr.Textbox(
+            label="Custom Prompt (optional)",
+            placeholder="Enter custom prompt here (optional)",
+            lines=4,
+        )
+        model_name = gr.Dropdown(
             label="Select Model",
+            choices=[
+                "Qwen/Qwen2.5-Coder-32B-Instruct",
+                "Qwen/Qwen2.5-72B-Instruct",
+                "meta-llama/Llama-3.2-1B-Instruct",
+                "mistralai/Mistral-7B-Instruct-v0.3",
+                "meta-llama/Meta-Llama-3-8B-Instruct",
+                "microsoft/Phi-3.5-mini-instruct",
+                "mistralai/Mixtral-8x7B-Instruct-v0.1",
+                "microsoft/Phi-3-mini-4k-instruct",
+            ],
+            value="Qwen/Qwen2.5-Coder-32B-Instruct",
+        )
+
+        output = gr.Textbox(label="Extracted Data", lines=10)
+
+        submit_button = gr.Button("Extract Data")
+
+        # Set up the interaction
+        submit_button.click(
+            extract_invoice_details_from_text,
+            inputs=[api_key, files, model_name, prompt],
+            outputs=[output],
         )
 
+    return demo
 
 # Launch the app
+app = gradio_interface()
 app.launch()
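The hunk starts at line 12, so the imports and the opening of extract_text_from_pdf are not shown. A minimal sketch of what that top section presumably contains, inferred from the calls visible in the diff (page.get_text() points to PyMuPDF, imported as fitz; gr and InferenceClient are used directly); the exact lines are assumptions, not taken from the commit:

import fitz  # PyMuPDF, assumed source of page.get_text()
import gradio as gr
from huggingface_hub import InferenceClient

# Assumed shape of the function whose tail appears at lines 12-15 of the diff
def extract_text_from_pdf(pdf_path):
    try:
        text = ""
        doc = fitz.open(pdf_path)
        for page in doc:
            text += page.get_text()
        return text
    except Exception as e:
        return f"Error extracting text from PDF: {str(e)}"

Under the same assumptions, the app depends on gradio, pymupdf, and huggingface_hub.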