Spaces:

skylord
/

annual-reports

Sleeping

App Files Files Community

annual-reports / app.py

skylord

Upload app.py

190c508 verified 7 months ago

raw

history blame contribute delete

3.09 kB

	import streamlit as st
	import os
	import pandas as pd
	from azure.ai.formrecognizer import DocumentAnalysisClient
	from azure.core.credentials import AzureKeyCredential
	from PyPDF2 import PdfReader, PdfWriter
	from io import BytesIO

	# Azure service settings are read from the environment at import time.
	# Indexing os.environ raises KeyError if either variable is not set.
	YOUR_ENDPOINT = os.environ["YOUR_ENDPOINT"]
	YOUR_KEY = os.environ["YOUR_KEY"]

	# Page-level Streamlit settings; this is the first st.* call in this file.
	st.set_page_config(
	page_title="PDF Table Extractor",
	layout="centered",
	initial_sidebar_state="auto"
	)

	# Module-level client, created once at import and used by main() for
	# every page that is analysed.
	document_analysis_client = DocumentAnalysisClient(
	endpoint=YOUR_ENDPOINT,
	credential=AzureKeyCredential(YOUR_KEY)
	)

	# Function to convert table cells to pandas DataFrame


	def table2pandas(table):
	    """Convert an analysed table into a rectangular pandas DataFrame.

	    Args:
	        table: Object exposing ``cells``, an iterable of cell objects that
	            each provide ``row_index``, ``column_index`` and ``content``.

	    Returns:
	        pandas.DataFrame with default integer row/column labels. Its shape
	        is ``(max row_index + 1, max column_index + 1)``; every position
	        for which no cell was supplied holds the empty string. An empty
	        DataFrame is returned when the table has no cells.
	    """
	    cells = list(table.cells)
	    if not cells:
	        return pd.DataFrame([])

	    # Size the grid once, up front. Growing rows lazily leaves them with
	    # different lengths, and pandas then fills the short rows with
	    # None/NaN while gaps inside a row hold "" -- two different markers
	    # for the same "no cell here" condition.
	    n_rows = max(cell.row_index for cell in cells) + 1
	    n_cols = max(cell.column_index for cell in cells) + 1
	    grid = [[""] * n_cols for _ in range(n_rows)]

	    for cell in cells:
	        grid[cell.row_index][cell.column_index] = cell.content

	    return pd.DataFrame(grid)

	# Function to split PDF into pages


	def split_pdf_to_pages(filepath):
	    """Split a PDF into standalone single-page PDF documents.

	    Args:
	        filepath: Location of the PDF, passed straight to ``PdfReader``.

	    Returns:
	        list[bytes]: one serialized single-page PDF per page of the input,
	        in page order. The list is empty when the input has no pages.
	    """
	    reader = PdfReader(filepath)
	    single_page_docs = []
	    for page in reader.pages:
	        writer = PdfWriter()
	        writer.add_page(page)
	        buffer = BytesIO()
	        writer.write(buffer)
	        # getvalue() returns the whole buffer regardless of the current
	        # stream position, so no rewind is needed before reading.
	        single_page_docs.append(buffer.getvalue())
	    return single_page_docs

	# Streamlit app


	def main():
	    """Render the page: accept a PDF upload, extract its tables, offer CSVs.

	    The uploaded PDF is split into single-page documents, each page is sent
	    to the ``prebuilt-layout`` model through ``document_analysis_client``,
	    and every table found is displayed together with a CSV download button.
	    Nothing is returned; all output goes through Streamlit calls.
	    """
	    import tempfile  # only needed here; kept local so the block is self-contained

	    st.title("PDF Table Extractor")

	    # Upload PDF file
	    uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
	    if uploaded_file is None:
	        return

	    # Write the upload to a uniquely named temporary file instead of
	    # "temp_files/<client file name>": a shared, client-named path lets two
	    # sessions uploading the same name overwrite each other and leaves the
	    # file on disk forever. delete=False so the file can be reopened by
	    # path after this handle is closed.
	    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
	        tmp.write(uploaded_file.getbuffer())
	        temp_filepath = tmp.name

	    st.text("Uploaded successfully. Extracting tables...")

	    try:
	        # The returned pages are in-memory bytes, so the temporary file is
	        # no longer needed once this call returns.
	        pages = split_pdf_to_pages(temp_filepath)
	    finally:
	        os.remove(temp_filepath)

	    # Strip only the final extension. str.replace('.pdf', '') would remove
	    # every ".pdf" substring in the name and would miss ".PDF".
	    stem = os.path.splitext(os.path.basename(uploaded_file.name))[0]

	    tables_found = 0
	    for page_num, page_bytes in enumerate(pages, start=1):
	        poller = document_analysis_client.begin_analyze_document(
	            "prebuilt-layout", document=page_bytes)
	        result = poller.result()

	        # A result without a "tables" attribute, or with an empty/None
	        # value, simply contributes no tables for this page.
	        page_tables = getattr(result, "tables", None) or []
	        for table_num, table in enumerate(page_tables):
	            table_df = table2pandas(table)
	            st.write(table_df)  # Display table in Streamlit (optional)

	            # NOTE(review): the table index in the file name is 0-based
	            # while the page number is 1-based; kept as-is so existing
	            # download names do not change.
	            csv_name = f"{stem}_page{page_num}_table{table_num}.csv"
	            st.download_button(
	                label="Download CSV",
	                data=table_df.to_csv(index=False).encode('utf-8'),
	                file_name=csv_name,
	                mime="text/csv",
	                # Explicit key: csv_name is unique per page/table, so
	                # widgets created in this loop can never share an identity.
	                key=csv_name,
	            )
	            tables_found += 1

	    # Only report success when something was actually extracted.
	    if tables_found:
	        st.success("Tables extracted and saved successfully!")
	    else:
	        st.warning("No tables were found in the uploaded PDF.")


	# Run the app only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	main()