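"""FastAPI service that splits an uploaded PDF into page chunks and scrapes
text and tables from each chunk concurrently via an external cloud function."""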
import asyncio
import io
import logging
import os
import shutil
import uuid
from pathlib import Path

import aiohttp
from aiohttp import FormData
from fastapi import FastAPI, Form, UploadFile
from fastapi.responses import JSONResponse
from PyPDF2 import PdfReader, PdfWriter

app = FastAPI()
async def call_pdfscraper(session, file_contents, pdf_name, processTables):
    """Send one PDF chunk to the scraper cloud function and return (response, name)."""
    headers = {"Origin": "http://localhost:8080"}
    url = "https://us-central1-neuralgap-1.cloudfunctions.net/scraperPDFDocxTables_v3"

    data = FormData()
    data.add_field(
        "pdf",
        file_contents,
        filename=os.path.basename(pdf_name),
        content_type="application/pdf",
    )
    data.add_field("processTables", processTables)

    async with session.post(url, data=data, headers=headers) as resp:
        if resp.status == 200:
            response = await resp.json()
        else:
            # Return an empty payload so one failed chunk does not abort the batch
            return {}, pdf_name

    return response, pdf_name
async def execute_pdfscraper_async(file_path: str, processTables: str):
    """Read every chunk in the directory and scrape them concurrently."""
    chunk_list = os.listdir(file_path)
    # Read each chunk into memory without leaking file handles
    chunk_byte_list = [
        (Path(file_path, file).read_bytes(), file) for file in chunk_list
    ]

    async with aiohttp.ClientSession() as session:
        tasks = [
            call_pdfscraper(session, file_bytes, file_name, processTables)
            for file_bytes, file_name in chunk_byte_list
        ]
        responses = await asyncio.gather(*tasks)

    # Keep only the JSON payload; the echoed file name is dropped here
    response_list = [response for response, _ in responses]
    return response_list
def collect_pdfscraper_response(scrape_response_list):
    """Merge per-chunk responses into a single corpus string and a table dict."""
    content_list = []
    tables_dict = {}
    table_count = 1

    for response in scrape_response_list:
        content = response.get("corpus", "")
        table_content = response.get("tables_raw", {})
        content_list.append(content)

        try:
            # Renumber tables sequentially across all chunks
            for table_key in table_content.keys():
                tables_dict[str(table_count)] = table_content[table_key]
                table_count += 1
        except AttributeError:
            # tables_raw was not a dict for this chunk; skip it
            pass

    content_str = "\n".join(content_list)
    return content_str, tables_dict
def split_pdf(file_contents, file_name, pages_per_chunk):
    """Split the uploaded PDF into chunks of `pages_per_chunk` pages on disk."""
    file_bytes = io.BytesIO(file_contents)
    reader = PdfReader(file_bytes)
    total_pages = len(reader.pages)

    # Generate a unique directory for each request to avoid conflicts
    unique_dir = str(uuid.uuid4())
    output_dir = Path(file_name).parent / f"chunks_{unique_dir}"
    os.makedirs(output_dir, exist_ok=True)

    num_chunks = (total_pages + pages_per_chunk - 1) // pages_per_chunk
    for i in range(num_chunks):
        writer = PdfWriter()
        start_page = i * pages_per_chunk
        end_page = min(start_page + pages_per_chunk, total_pages)
        for page_number in range(start_page, end_page):
            writer.add_page(reader.pages[page_number])

        chunk_file_name = f"{Path(file_name).stem}_{i + 1}.pdf"
        output_path = output_dir / chunk_file_name
        with open(output_path, "wb") as output_pdf:
            writer.write(output_pdf)

    return str(output_dir)
@app.post("/process-pdf/")
async def process_pdf(
    pdf_file: UploadFile,
    pages_per_chunk: int = Form(2),
    processTables: str = Form("True"),
):
    # Read the uploaded PDF into memory
    file_contents = await pdf_file.read()

    # Split the PDF into page chunks on disk
    chunks_dir = split_pdf(file_contents, pdf_file.filename, pages_per_chunk)

    # Scrape the chunks concurrently
    scrape_response_list = await execute_pdfscraper_async(chunks_dir, processTables)

    # Collect the results into a single corpus string and a table dict
    content, tables_dict = collect_pdfscraper_response(scrape_response_list)

    # Clean up the chunk directory after processing
    if os.path.exists(chunks_dir):
        try:
            shutil.rmtree(chunks_dir)
        except Exception as e:
            logging.error(f"Error deleting directory {chunks_dir}: {e}")

    return JSONResponse(content={"content": content, "tables": tables_dict})
# If you want to run this locally, uncomment the lines below.
# if __name__ == "__main__":
#     import uvicorn
#     uvicorn.run(app, host="0.0.0.0", port=8000)

# Or serve with multiple workers, for example:
#   uvicorn main:app --workers 2
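# Example client call (a sketch; assumes the app is running locally on port 8000
# and that a file named "example.pdf" exists — adjust the path and host to your setup):
#
# import requests
#
# with open("example.pdf", "rb") as f:
#     resp = requests.post(
#         "http://localhost:8000/process-pdf/",
#         files={"pdf_file": ("example.pdf", f, "application/pdf")},
#         data={"pages_per_chunk": "2", "processTables": "True"},
#     )
# result = resp.json()
# print(result["content"])  # merged corpus text from all chunks
# print(result["tables"])   # dict of extracted tables keyed by sequence number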