import asyncio
import io
import logging
import os
import shutil
import uuid
from pathlib import Path

import aiohttp
from aiohttp import FormData
from fastapi import FastAPI, Form, UploadFile
from fastapi.responses import JSONResponse
from PyPDF2 import PdfReader, PdfWriter

app = FastAPI()

async def call_pdfscraper(session, file_contents, pdf_name, processTables):
    """Send one PDF chunk to the scraper service and return (json_response, pdf_name)."""
    headers = {"Origin": "http://localhost:8080"}
    url = "https://us-central1-neuralgap-1.cloudfunctions.net/scraperPDFDocxTables_v3"
    data = FormData()
    data.add_field(
        "pdf",
        file_contents,
        filename=os.path.basename(pdf_name),
        content_type="application/pdf",
    )
    data.add_field("processTables", processTables)

    async with session.post(url, data=data, headers=headers) as resp:
        if resp.status == 200:
            response = await resp.json()
        else:
            logging.error(f"pdfscraper returned status {resp.status} for {pdf_name}")
            return {}, pdf_name

    return response, pdf_name

async def execute_pdfscraper_async(file_path: str, processTables: str):
    """Read every chunk in file_path and scrape them concurrently, preserving page order."""
    # Sort chunks by their numeric suffix ("<stem>_<n>.pdf") so the merged corpus keeps page order.
    chunk_list = sorted(
        os.listdir(file_path),
        key=lambda name: int(Path(name).stem.rsplit("_", 1)[-1]),
    )
    chunk_byte_list = []
    for file in chunk_list:
        # Read each chunk with a context manager so file handles are closed promptly.
        with open(os.path.join(file_path, file), "rb") as f:
            chunk_byte_list.append((f.read(), file))

    response_list = []
    async with aiohttp.ClientSession() as session:
        tasks = [
            call_pdfscraper(session, contents, name, processTables)
            for contents, name in chunk_byte_list
        ]
        responses = await asyncio.gather(*tasks)
        for response, _ in responses:
            response_list.append(response)

    return response_list

def collect_pdfscraper_response(scrape_response_list):
    """Merge per-chunk responses into one corpus string and a numbered dict of tables."""
    content_list = []
    tables_dict = {}
    table_count = 1
    for response in scrape_response_list:
        content = response.get("corpus", "")
        table_content = response.get("tables_raw", {})

        content_list.append(content)
        try:
            for table_key in table_content:
                tables_dict[str(table_count)] = table_content[table_key]
                table_count += 1
        except AttributeError:
            # tables_raw may be missing or not a mapping for some chunks.
            pass

    content_str = "\n".join(content_list)

    return content_str, tables_dict

def split_pdf(file_contents, file_name, pages_per_chunk):
    """Split the uploaded PDF into chunks of pages_per_chunk pages and return the chunk directory path."""
    file_bytes = io.BytesIO(file_contents)
    reader = PdfReader(file_bytes)
    total_pages = len(reader.pages)
    
    # Generate a unique directory for each request to avoid conflicts
    unique_dir = str(uuid.uuid4())
    output_dir = Path(file_name).parent / f"chunks_{unique_dir}"
    os.makedirs(output_dir, exist_ok=True)

    num_chunks = (total_pages + pages_per_chunk - 1) // pages_per_chunk

    for i in range(num_chunks):
        writer = PdfWriter()
        start_page = i * pages_per_chunk
        end_page = min(start_page + pages_per_chunk, total_pages)

        for page_number in range(start_page, end_page):
            writer.add_page(reader.pages[page_number])

        chunk_file_name = f"{Path(file_name).stem}_{i + 1}.pdf"
        output_path = output_dir / chunk_file_name
        with open(output_path, "wb") as output_pdf:
            writer.write(output_pdf)

    return str(output_dir)

@app.post("/process-pdf/")
async def process_pdf(pdf_file: UploadFile, pages_per_chunk: int = Form(2), processTables: str = Form("True")):
    # Read the PDF file
    file_contents = await pdf_file.read()

    # Split the PDF into chunks
    chunks_dir = split_pdf(file_contents, pdf_file.filename, pages_per_chunk)
    
    # Asynchronously process the PDF chunks
    scrape_response_list = await execute_pdfscraper_async(chunks_dir, processTables)
    
    # Collect the results
    content, tables_dict = collect_pdfscraper_response(scrape_response_list)
    
    # Ensure the directory exists before attempting to delete it
    if os.path.exists(chunks_dir):
        try:
            shutil.rmtree(chunks_dir)  # Clean up chunks after processing
        except Exception as e:
            # Log any errors during cleanup
            logging.error(f"Error deleting directory {chunks_dir}: {e}")
    
    return JSONResponse(content={"content": content, "tables": tables_dict})

# If you want to run this locally, uncomment the lines below.
# if __name__ == "__main__":
#     import uvicorn
#     uvicorn.run(app, host="0.0.0.0", port=8000)
#
# Or start it from the shell with multiple workers:
#     uvicorn main:app --workers 2
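
# Example client call (a minimal sketch, not part of the service itself): this assumes the
# app is running at http://localhost:8000 and that "sample.pdf" exists locally; both the
# URL and the file name are placeholders. It exercises /process-pdf/ with the `requests`
# library, passing the upload plus the two form fields defined above.
#
# import requests
#
# def scrape_pdf(path: str, pages_per_chunk: int = 2, process_tables: str = "True"):
#     with open(path, "rb") as f:
#         resp = requests.post(
#             "http://localhost:8000/process-pdf/",
#             files={"pdf_file": (path, f, "application/pdf")},
#             data={
#                 "pages_per_chunk": str(pages_per_chunk),
#                 "processTables": process_tables,
#             },
#         )
#     resp.raise_for_status()
#     return resp.json()  # {"content": "<merged corpus>", "tables": {...}}
#
# if __name__ == "__main__":
#     result = scrape_pdf("sample.pdf")
#     print(result["content"][:500])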