# Page-scrape header (not part of the program): "Spaces: Sleeping"
import streamlit as st | |
import os | |
import pandas as pd | |
from azure.ai.formrecognizer import DocumentAnalysisClient | |
from azure.core.credentials import AzureKeyCredential | |
from PyPDF2 import PdfReader, PdfWriter | |
from io import BytesIO | |
# Azure Form Recognizer connection settings are taken from the environment;
# a missing variable raises KeyError at import time.
YOUR_ENDPOINT = os.environ["YOUR_ENDPOINT"]
YOUR_KEY = os.environ["YOUR_KEY"]

# Page-level Streamlit settings, applied before any other UI element is drawn.
st.set_page_config(page_title="PDF Table Extractor", layout="centered", initial_sidebar_state="auto")

# Module-level client used by main() for every page analysis request.
document_analysis_client = DocumentAnalysisClient(
    endpoint=YOUR_ENDPOINT,
    credential=AzureKeyCredential(YOUR_KEY),
)
# Function to convert table cells to pandas DataFrame
def table2pandas(table):
    """Convert an analyzed table into a rectangular pandas DataFrame.

    Args:
        table: Object exposing ``cells``, an iterable of cell objects with
            ``row_index``, ``column_index`` and ``content`` attributes.

    Returns:
        A DataFrame whose entry at (row_index, column_index) is the cell's
        ``content``. Every position not covered by a cell holds ``""``.
        A table with no cells yields an empty DataFrame.
    """
    cells = list(table.cells)
    if not cells:
        return pd.DataFrame([])

    # Size the grid up front so every row has the same width. Growing rows
    # lazily left short rows for pandas to pad with None/NaN, while gaps inside
    # a row were filled with "" -- two different "empty" markers in one frame.
    n_rows = max(cell.row_index for cell in cells) + 1
    n_cols = max(cell.column_index for cell in cells) + 1
    grid = [[""] * n_cols for _ in range(n_rows)]

    for cell in cells:
        grid[cell.row_index][cell.column_index] = cell.content

    return pd.DataFrame(grid)
# Function to split PDF into pages
def split_pdf_to_pages(filepath):
    """Split a PDF into standalone single-page PDF documents.

    Args:
        filepath: Source PDF, passed unchanged to ``PdfReader``.

    Returns:
        A list of ``bytes`` objects, one per page in document order; each is
        the serialized output of a ``PdfWriter`` holding just that page.
    """
    reader = PdfReader(filepath)

    def _render_single_page(page):
        # One fresh writer per page so each output contains only that page.
        writer = PdfWriter()
        writer.add_page(page)
        buffer = BytesIO()
        writer.write(buffer)
        return buffer.getvalue()

    return [_render_single_page(page) for page in reader.pages]
# Streamlit app
def main():
    """Render the app: upload a PDF, extract its tables, offer CSV downloads.

    For every page of the uploaded PDF the page is sent to the
    "prebuilt-layout" model through ``document_analysis_client``. Each table
    in the result is displayed and paired with a CSV download button. Nothing
    is written to local disk; the upload is processed from memory.

    Returns:
        None. All output goes through Streamlit widgets.
    """
    st.title("PDF Table Extractor")

    # Upload PDF file
    uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
    if uploaded_file is None:
        return

    st.text("Uploaded successfully. Extracting tables...")

    # splitext removes only the final extension, whatever its case;
    # str.replace('.pdf', '') would also strip ".pdf" from the middle of a name
    # and would miss ".PDF".
    base_name = os.path.splitext(os.path.basename(uploaded_file.name))[0]

    # Read the upload from memory instead of saving it under a directory using
    # the client-supplied filename: no leftover files, no collisions between
    # users uploading the same name, and no untrusted name used as a path.
    pages = split_pdf_to_pages(BytesIO(uploaded_file.getvalue()))

    # NOTE(review): Streamlit reruns the script on widget interaction, so
    # clicking a download button presumably repeats the analysis -- consider
    # caching the results if that cost matters.
    tables_found = 0
    for page_num, page_bytes in enumerate(pages):
        poller = document_analysis_client.begin_analyze_document(
            "prebuilt-layout", document=page_bytes
        )
        result = poller.result()

        # Treat a missing or empty ``tables`` attribute as "no tables on this page".
        for table_num, table in enumerate(getattr(result, "tables", None) or []):
            tables_found += 1
            table_df = table2pandas(table)
            st.write(table_df)  # Display table in Streamlit (optional)

            # Provide a download button for the CSV file.
            # File names keep the existing scheme: 1-based page, 0-based table.
            csv_file = table_df.to_csv(index=False).encode("utf-8")
            st.download_button(
                label="Download CSV",
                data=csv_file,
                file_name=f"{base_name}_page{page_num + 1}_table{table_num}.csv",
                mime="text/csv",
                # Explicit key keeps widgets with the same label distinct.
                key=f"download_page{page_num}_table{table_num}",
            )

    # Only claim success when at least one table was actually extracted.
    if tables_found:
        st.success("Tables extracted and saved successfully!")
    else:
        st.warning("No tables were found in the uploaded PDF.")
# Run the app only when this file is executed as the main script.
if __name__ == "__main__":
    main()