# Debug helper: print the installed packages (works on pip >= 10 and older versions).
try:
    from pip._internal.operations import freeze
except ImportError:  # pip < 10.0
    from pip.operations import freeze

pkgs = freeze.freeze()
for pkg in pkgs:
    print(pkg)

import asyncio
import io
import os
import time
from pathlib import Path

import aiohttp
from aiohttp import FormData
from fastapi import FastAPI, File, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from PyPDF2 import PdfReader, PdfWriter

app = FastAPI()

# Configure CORS
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


async def execute_pdfscraper_async(file_contents: bytes, file_name: str, pages_per_chunk: int):
    """Split the PDF into chunks on disk and scrape all chunks concurrently."""
    split_pdf(file_contents, file_name, pages_per_chunk)

    response_list = []
    async with aiohttp.ClientSession() as session:
        tasks = [
            call_pdfscraper(session, chunk_data, chunk_name)
            for chunk_data, chunk_name in load_chunks(file_name)
        ]
        responses = await asyncio.gather(*tasks)

        for response, _chunk_name in responses:
            response_list.append(response)

    return response_list


async def call_pdfscraper(session, file_contents, pdf_name):
    """POST a single PDF chunk to the scraper endpoint and return (json, chunk name)."""
    headers = {"Origin": "http://localhost:8080"}
    url = "https://us-central1-neuralgap-1.cloudfunctions.net/scraperPDFDocxTables_v3"

    # Build the multipart form body expected by the scraper.
    data = FormData()
    data.add_field(
        "pdf",
        file_contents,
        filename=os.path.basename(pdf_name),
        content_type="application/pdf",
    )
    data.add_field("processTables", "True")

    async with session.post(url, data=data, headers=headers) as resp:
        if resp.status == 200:
            response = await resp.json()
        else:
            print(f"Failed to get response: {resp.status}")
            # Keep the return shape consistent so the caller can always unpack a tuple.
            response = {}

    return response, pdf_name


def collect_pdfscraper_response(scrape_response_list):
    """Merge per-chunk responses into one text corpus and a numbered dict of tables."""
    content_list = []
    tables_dict = {}
    table_count = 1

    for response in scrape_response_list:
        content = response.get("corpus", "")
        table_content = response.get("tables_raw")

        content_list.append(content)
        try:
            for table_key in table_content.keys():
                tables_dict[str(table_count)] = table_content[table_key]
                table_count += 1
        except AttributeError:  # no tables in this chunk
            pass

    content_str = "\n".join(content_list)

    return content_str, tables_dict


def split_pdf(file_contents, file_name, pages_per_chunk):
    """Write the uploaded PDF to disk as chunks of `pages_per_chunk` pages each."""
    file_bytes = io.BytesIO(file_contents)
    reader = PdfReader(file_bytes)
    total_pages = len(reader.pages)

    output_dir = Path(file_name).parent / "chunks"
    os.makedirs(output_dir, exist_ok=True)

    num_chunks = (total_pages + pages_per_chunk - 1) // pages_per_chunk

    for i in range(num_chunks):
        writer = PdfWriter()
        start_page = i * pages_per_chunk
        end_page = min(start_page + pages_per_chunk, total_pages)

        for page_number in range(start_page, end_page):
            writer.add_page(reader.pages[page_number])

        chunk_file_name = f"{Path(file_name).stem}_{i + 1}.pdf"
        output_path = output_dir / chunk_file_name
        with open(output_path, "wb") as output_pdf:
            writer.write(output_pdf)


def load_chunks(file_name):
    """Read every chunk back from the chunks directory as (bytes, file name) pairs."""
    output_dir = Path(file_name).parent / "chunks"
    chunk_byte_list = [
        (path.read_bytes(), path.name) for path in sorted(output_dir.iterdir())
    ]

    return chunk_byte_list


@app.post("/process-pdf/")
async def process_pdf(file: UploadFile = File(...), pages_per_chunk: int = 2):
    file_contents = await file.read()
    file_name = file.filename

    start_time = time.time()
    scrape_response_list = await execute_pdfscraper_async(file_contents, file_name, pages_per_chunk)
    content, tables = collect_pdfscraper_response(scrape_response_list)
    time_taken = time.time() - start_time

    return JSONResponse(content={"content": content, "tables": tables, "time_taken": time_taken})


# Start the FastAPI app
# (note: uvicorn only honors `workers` when the app is passed as an import string)
# if __name__ == "__main__":
#     import uvicorn
#     uvicorn.run(app, port=7000, workers=2)
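# A minimal usage sketch, not part of the service itself: once the app is running
# (e.g. `uvicorn main:app --port 7000`), a PDF can be submitted like this. The module
# name `main` and the file `sample.pdf` are assumptions for illustration only.
#
# import requests
#
# with open("sample.pdf", "rb") as f:
#     resp = requests.post(
#         "http://localhost:7000/process-pdf/",
#         files={"file": ("sample.pdf", f, "application/pdf")},
#         params={"pages_per_chunk": 2},
#     )
# print(resp.json()["time_taken"])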