File size: 3,093 Bytes
190c508
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import streamlit as st
import os
import pandas as pd
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from PyPDF2 import PdfReader, PdfWriter
from io import BytesIO

# Azure Form Recognizer connection settings come from the environment.
# Indexing os.environ directly means a missing variable raises KeyError
# as soon as the script starts.
YOUR_ENDPOINT = os.environ["YOUR_ENDPOINT"]
YOUR_KEY = os.environ["YOUR_KEY"]

# Page-level Streamlit configuration.
st.set_page_config(
    page_title="PDF Table Extractor", layout="centered", initial_sidebar_state="auto"
)

# Module-level client used by main() for every page analysis request.
_azure_credential = AzureKeyCredential(YOUR_KEY)
document_analysis_client = DocumentAnalysisClient(
    endpoint=YOUR_ENDPOINT, credential=_azure_credential
)

# Function to convert table cells to pandas DataFrame


def table2pandas(table):
    """Convert an analyzed table into a rectangular pandas DataFrame.

    Args:
        table: Object exposing ``cells``, an iterable whose items each have
            ``row_index``, ``column_index`` and ``content`` attributes.

    Returns:
        pandas.DataFrame with one row per table row and one column per table
        column, sized by the largest row/column index seen among the cells.
        Every position that has no cell holds an empty string. A table with
        no cells yields an empty DataFrame.
    """
    cells = list(table.cells)
    if not cells:
        return pd.DataFrame([])

    n_rows = max(cell.row_index for cell in cells) + 1
    n_cols = max(cell.column_index for cell in cells) + 1

    # Pre-fill the whole grid so that missing cells are uniformly "" instead
    # of a mix of "" (gaps inside a row) and None (rows shorter than the
    # widest one, which pandas would otherwise pad on its own).
    grid = [[""] * n_cols for _ in range(n_rows)]
    for cell in cells:
        grid[cell.row_index][cell.column_index] = cell.content
    return pd.DataFrame(grid)

# Function to split PDF into pages


def split_pdf_to_pages(filepath):
    """Split a PDF into standalone single-page PDF documents.

    Args:
        filepath: Source PDF, handed unchanged to ``PdfReader``.

    Returns:
        list[bytes]: One serialized single-page PDF per page of the input,
        in page order.
    """
    reader = PdfReader(filepath)
    single_page_docs = []
    for page in reader.pages:
        writer = PdfWriter()
        writer.add_page(page)
        # Serialize into memory and keep only the raw bytes.
        buffer = BytesIO()
        writer.write(buffer)
        single_page_docs.append(buffer.getvalue())
    return single_page_docs

# Streamlit app


def main():
    """Render the app: upload a PDF, extract its tables, offer CSV downloads.

    Each page of the uploaded PDF is sent separately to the "prebuilt-layout"
    model through the module-level ``document_analysis_client``. Every table
    in the result is displayed and paired with a CSV download button. Nothing
    is rendered beyond the title and uploader until a file is uploaded.
    """
    st.title("PDF Table Extractor")

    # Upload PDF file
    uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
    if uploaded_file is None:
        return

    st.text("Uploaded successfully. Extracting tables...")

    # Read the upload straight from memory rather than writing it to a
    # temp_files/ directory: nothing is left behind on disk and same-named
    # uploads cannot overwrite one another.
    pages = split_pdf_to_pages(BytesIO(uploaded_file.getvalue()))

    # splitext drops only the final extension, whatever its letter case;
    # str.replace('.pdf', '') would strip every occurrence and miss '.PDF'.
    base_name = os.path.splitext(os.path.basename(uploaded_file.name))[0]

    # NOTE(review): Streamlit presumably reruns this script on each widget
    # interaction, which would repeat the analysis calls after every download
    # click - consider caching the per-page results; confirm before changing.
    tables_found = 0
    for page_num, page_bytes in enumerate(pages, start=1):
        poller = document_analysis_client.begin_analyze_document(
            "prebuilt-layout", document=page_bytes)
        result = poller.result()

        # Treat a missing or empty `tables` attribute as "no tables".
        page_tables = getattr(result, "tables", None) or []
        for table_num, table in enumerate(page_tables, start=1):
            tables_found += 1
            table_df = table2pandas(table)
            st.write(table_df)  # Display table in Streamlit (optional)

            # Provide a download button for the CSV file. Page and table
            # numbers are both 1-based in the file name; the explicit key
            # keeps every button in the loop uniquely identified.
            csv_file = table_df.to_csv(index=False).encode('utf-8')
            st.download_button(
                label="Download CSV",
                data=csv_file,
                file_name=f"{base_name}_page{page_num}_table{table_num}.csv",
                mime="text/csv",
                key=f"download_page{page_num}_table{table_num}",
            )

    # Only report success when at least one table was actually produced.
    if tables_found:
        st.success("Tables extracted and saved successfully!")
    else:
        st.warning("No tables were found in the uploaded PDF.")


# Invoke main() only when this file is run as the main module, not on import.
if __name__ == "__main__":
    main()