# Repository page header captured with the file (not part of the program):
# annual-reports / app.py
# skylord's picture
# Upload app.py
# 190c508 verified
import streamlit as st
import os
import pandas as pd
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from PyPDF2 import PdfReader, PdfWriter
from io import BytesIO
# Azure Form Recognizer connection settings, read from the environment at
# import time. Indexing os.environ raises KeyError if either variable is unset.
YOUR_ENDPOINT = os.environ["YOUR_ENDPOINT"]
YOUR_KEY = os.environ["YOUR_KEY"]
# Page-level Streamlit settings (browser tab title, layout, sidebar state).
# NOTE(review): Streamlit expects set_page_config to run before any other
# Streamlit command, so keep this ahead of the UI code in main().
st.set_page_config(
    page_title="PDF Table Extractor",
    layout="centered",
    initial_sidebar_state="auto"
)
# Single module-level client, created once at import and used by main() for
# every page that is analyzed.
document_analysis_client = DocumentAnalysisClient(
    endpoint=YOUR_ENDPOINT,
    credential=AzureKeyCredential(YOUR_KEY)
)
# Function to convert table cells to pandas DataFrame
def table2pandas(table):
    """Convert a table's cells into a rectangular pandas DataFrame.

    Args:
        table: Object exposing ``cells``, an iterable of items that each
            carry ``row_index``, ``column_index`` and ``content``.

    Returns:
        A DataFrame with one row per table row and one column per table
        column. Every position that has no cell holds the empty string, so
        all rows have the same width. A table without cells yields an empty
        DataFrame.
    """
    cells = list(table.cells)
    if not cells:
        return pd.DataFrame([])

    # Size the grid up front from the largest indices seen. Growing rows
    # lazily leaves short rows ragged, which pandas then pads with None/NaN
    # instead of the "" used for every other missing cell.
    n_rows = max(cell.row_index for cell in cells) + 1
    n_cols = max(cell.column_index for cell in cells) + 1
    grid = [[""] * n_cols for _ in range(n_rows)]

    # If two cells report the same position, the later one wins.
    for cell in cells:
        grid[cell.row_index][cell.column_index] = cell.content
    return pd.DataFrame(grid)
# Function to split PDF into pages
def split_pdf_to_pages(filepath):
    """Split a PDF into standalone single-page PDFs.

    Args:
        filepath: Source PDF, passed straight to ``PdfReader``.

    Returns:
        A list of ``bytes`` objects, one per page in document order, each
        holding a complete PDF that contains just that page.
    """
    reader = PdfReader(filepath)
    single_page_pdfs = []
    for page in reader.pages:
        # A fresh writer per page keeps every output document independent.
        writer = PdfWriter()
        writer.add_page(page)
        buffer = BytesIO()
        writer.write(buffer)
        single_page_pdfs.append(buffer.getvalue())
    return single_page_pdfs
# Streamlit app
def main():
    """Render the page: accept a PDF upload, extract its tables, offer CSVs.

    Each page of the uploaded PDF is sent on its own to the "prebuilt-layout"
    model through the module-level ``document_analysis_client``. Every table
    found is displayed and gets its own CSV download button. Nothing is
    returned; all output goes to the Streamlit page.
    """
    st.title("PDF Table Extractor")
    # Upload PDF file
    uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
    if uploaded_file is None:
        return

    st.text("Uploaded successfully. Extracting tables...")

    # Split the upload straight from memory. Writing it to disk under the
    # user-supplied file name left a copy of every upload behind and let the
    # client choose the path on the server.
    pages = split_pdf_to_pages(BytesIO(uploaded_file.getvalue()))

    # splitext removes only the real extension, whatever its case, and leaves
    # any ".pdf" that happens to appear earlier in the name untouched.
    base_name = os.path.splitext(os.path.basename(uploaded_file.name))[0]

    tables_found = 0
    for page_num, page_bytes in enumerate(pages, start=1):
        poller = document_analysis_client.begin_analyze_document(
            "prebuilt-layout", document=page_bytes)
        result = poller.result()
        # Treat a missing or empty `tables` attribute as "no tables".
        for table_num, table in enumerate(getattr(result, "tables", None) or []):
            table_df = table2pandas(table)
            st.write(table_df)  # Display table in Streamlit (optional)
            # Provide a download link for the CSV file
            csv_file = table_df.to_csv(index=False).encode('utf-8')
            st.download_button(
                label="Download CSV",
                data=csv_file,
                # Page numbers are 1-based, table numbers 0-based.
                file_name=f"{base_name}_page{page_num}_table{table_num}.csv",
                mime="text/csv",
                # Explicit key so buttons created in the loop never collide.
                key=f"download_page{page_num}_table{table_num}",
            )
            tables_found += 1

    # Only report success when something was actually extracted.
    if tables_found:
        st.success("Tables extracted and saved successfully!")
    else:
        st.warning("No tables were found in the uploaded PDF.")
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()