# PDF Table Extractor — Streamlit app (scraped from a Hugging Face Space).
import streamlit as st
import os
import pandas as pd
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from PyPDF2 import PdfReader, PdfWriter
from io import BytesIO
# Azure Form Recognizer credentials are read at import time; the app fails
# fast with a KeyError if either environment variable is missing.
YOUR_ENDPOINT = os.environ["YOUR_ENDPOINT"]
YOUR_KEY = os.environ["YOUR_KEY"]
# Page config must be the first Streamlit call in the script.
st.set_page_config(
page_title="PDF Table Extractor",
layout="centered",
initial_sidebar_state="auto"
)
# One shared client, reused for every per-page analysis request in main().
document_analysis_client = DocumentAnalysisClient(
endpoint=YOUR_ENDPOINT,
credential=AzureKeyCredential(YOUR_KEY)
)
# Function to convert table cells to pandas DataFrame
def table2pandas(table):
data = []
for cell in table.cells:
while len(data) <= cell.row_index:
data.append([])
while len(data[cell.row_index]) <= cell.column_index:
data[cell.row_index].append("")
data[cell.row_index][cell.column_index] = cell.content
return pd.DataFrame(data)
# Function to split PDF into pages
def split_pdf_to_pages(filepath):
input_pdf = PdfReader(filepath)
pages = []
for page_num in range(len(input_pdf.pages)):
output_pdf = PdfWriter()
output_pdf.add_page(input_pdf.pages[page_num])
page_stream = BytesIO()
output_pdf.write(page_stream)
page_stream.seek(0)
pages.append(page_stream.read())
return pages
# Streamlit app
def main():
st.title("PDF Table Extractor")
# Upload PDF file
uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
if uploaded_file is not None:
# Temporarily save uploaded PDF
os.makedirs("temp_files", exist_ok=True)
temp_filepath = os.path.join("temp_files", uploaded_file.name)
with open(temp_filepath, "wb") as f:
f.write(uploaded_file.getbuffer())
st.text("Uploaded successfully. Extracting tables...")
# Process the uploaded PDF
pages = split_pdf_to_pages(temp_filepath)
for page_num, page_bytes in enumerate(pages):
poller = document_analysis_client.begin_analyze_document(
"prebuilt-layout", document=page_bytes)
result = poller.result()
if hasattr(result, 'tables') and result.tables:
for table_num, table in enumerate(result.tables):
table_df = table2pandas(table)
st.write(table_df) # Display table in Streamlit (optional)
# Provide a download link for the CSV file
csv_file = table_df.to_csv(index=False).encode('utf-8')
st.download_button(
label="Download CSV",
data=csv_file,
file_name=f"{os.path.basename(uploaded_file.name).replace('.pdf', '')}_page{page_num + 1}_table{table_num}.csv",
mime="text/csv"
)
st.success("Tables extracted and saved successfully!")
if __name__ == "__main__":
main()
# end of file