# PDF Table Extractor — Streamlit app (scraped from a Hugging Face Space).
import streamlit as st
import os
import pandas as pd
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from PyPDF2 import PdfReader, PdfWriter
from io import BytesIO
# Azure Form Recognizer credentials are read at import time; the app fails
# fast with a KeyError if either environment variable is missing.
YOUR_ENDPOINT = os.environ["YOUR_ENDPOINT"]
YOUR_KEY = os.environ["YOUR_KEY"]
# Page config must be the first Streamlit call in the script.
st.set_page_config(
page_title="PDF Table Extractor",
layout="centered",
initial_sidebar_state="auto"
)
# One shared client, reused for every per-page analysis request in main().
document_analysis_client = DocumentAnalysisClient(
endpoint=YOUR_ENDPOINT,
credential=AzureKeyCredential(YOUR_KEY)
)
# Function to convert table cells to pandas DataFrame
def table2pandas(table):
data = []
for cell in table.cells:
while len(data) <= cell.row_index:
data.append([])
while len(data[cell.row_index]) <= cell.column_index:
data[cell.row_index].append("")
data[cell.row_index][cell.column_index] = cell.content
return pd.DataFrame(data)
# Function to split PDF into pages
def split_pdf_to_pages(filepath):
input_pdf = PdfReader(filepath)
pages = []
for page_num in range(len(input_pdf.pages)):
output_pdf = PdfWriter()
output_pdf.add_page(input_pdf.pages[page_num])
page_stream = BytesIO()
output_pdf.write(page_stream)
page_stream.seek(0)
pages.append(page_stream.read())
return pages
# Streamlit app
def main():
st.title("PDF Table Extractor")
# Upload PDF file
uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
if uploaded_file is not None:
# Temporarily save uploaded PDF
os.makedirs("temp_files", exist_ok=True)
temp_filepath = os.path.join("temp_files", uploaded_file.name)
with open(temp_filepath, "wb") as f:
f.write(uploaded_file.getbuffer())
st.text("Uploaded successfully. Extracting tables...")
# Process the uploaded PDF
pages = split_pdf_to_pages(temp_filepath)
for page_num, page_bytes in enumerate(pages):
poller = document_analysis_client.begin_analyze_document(
"prebuilt-layout", document=page_bytes)
result = poller.result()
if hasattr(result, 'tables') and result.tables:
for table_num, table in enumerate(result.tables):
table_df = table2pandas(table)
st.write(table_df) # Display table in Streamlit (optional)
# Provide a download link for the CSV file
csv_file = table_df.to_csv(index=False).encode('utf-8')
st.download_button(
label="Download CSV",
data=csv_file,
file_name=f"{os.path.basename(uploaded_file.name).replace('.pdf', '')}_page{page_num + 1}_table{table_num}.csv",
mime="text/csv"
)
st.success("Tables extracted and saved successfully!")
if __name__ == "__main__":
main()
# end of file