# Print the installed packages; the fallback import covers older pip releases.
try:
    from pip._internal.operations import freeze
except ImportError:  # pip < 10
    from pip.operations import freeze

pkgs = freeze.freeze()
for pkg in pkgs:
    print(pkg)

import asyncio
import io
import os
import time
from pathlib import Path

import aiohttp
from aiohttp import FormData
from fastapi import FastAPI, File, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from PyPDF2 import PdfReader, PdfWriter

app = FastAPI()

# Allow cross-origin requests from any origin.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


async def execute_pdfscraper_async(file_contents: bytes, file_name: str, pages_per_chunk: int):
    """Split the uploaded PDF into chunks and scrape every chunk concurrently."""
    split_pdf(file_contents, file_name, pages_per_chunk)
    response_list = []
    async with aiohttp.ClientSession() as session:
        tasks = [
            call_pdfscraper(session, chunk_data, chunk_name)
            for chunk_data, chunk_name in load_chunks(file_name)
        ]
        responses = await asyncio.gather(*tasks)
        for response in responses:
            response_list.append(response[0])

    return response_list


async def call_pdfscraper(session, file_contents, pdf_name):
    """POST a single PDF chunk to the remote scraper service and return (json, name)."""
    headers = {"Origin": "http://localhost:8080"}
    url = "https://us-central1-neuralgap-1.cloudfunctions.net/scraperPDFDocxTables_v3"

    data = FormData()
    data.add_field(
        "pdf",
        file_contents,
        filename=os.path.basename(pdf_name),
        content_type="application/pdf",
    )
    data.add_field("processTables", "True")

    async with session.post(url, data=data, headers=headers) as resp:
        if resp.status == 200:
            response = await resp.json()
        else:
            print(f"Failed to get response: {resp.status}")
            # Keep the (response, name) shape on failure so the caller can still
            # index the result tuple.
            return {}, pdf_name

    return response, pdf_name


def collect_pdfscraper_response(scrape_response_list):
    """Merge per-chunk responses into one text corpus and a numbered dict of tables."""
    content_list = []
    tables_dict = {}
    table_count = 1
    for response in scrape_response_list:
        # Failed chunks come back as empty dicts, so fall back to empty values.
        content = response.get("corpus", "")
        table_content = response.get("tables_raw", {})

        content_list.append(content)
        try:
            for table_key in table_content.keys():
                tables_dict[str(table_count)] = table_content[table_key]
                table_count += 1
        except AttributeError:
            pass

    content_str = "\n".join(content_list)

    return content_str, tables_dict


def split_pdf(file_contents, file_name, pages_per_chunk):
    """Write the uploaded PDF out as smaller PDFs of at most pages_per_chunk pages."""
    file_bytes = io.BytesIO(file_contents)
    reader = PdfReader(file_bytes)
    total_pages = len(reader.pages)

    output_dir = Path(file_name).parent / "chunks"
    os.makedirs(output_dir, exist_ok=True)

    # Ceiling division: the last chunk may hold fewer pages than pages_per_chunk.
    num_chunks = (total_pages + pages_per_chunk - 1) // pages_per_chunk

    for i in range(num_chunks):
        writer = PdfWriter()
        start_page = i * pages_per_chunk
        end_page = min(start_page + pages_per_chunk, total_pages)

        for page_number in range(start_page, end_page):
            writer.add_page(reader.pages[page_number])

        chunk_file_name = f"{Path(file_name).stem}_{i + 1}.pdf"
        output_path = output_dir / chunk_file_name
        with open(output_path, "wb") as output_pdf:
            writer.write(output_pdf)


def load_chunks(file_name):
    """Read each chunk written by split_pdf back into memory as (bytes, filename) pairs."""
    output_dir = Path(file_name).parent / "chunks"
    chunk_byte_list = []
    for chunk_name in os.listdir(output_dir):
        # Use a context manager so file handles are closed after reading.
        with open(output_dir / chunk_name, "rb") as chunk_file:
            chunk_byte_list.append((chunk_file.read(), chunk_name))
    return chunk_byte_list


@app.post("/process-pdf/")
async def process_pdf(file: UploadFile = File(...), pages_per_chunk: int = 2):
    """Split an uploaded PDF, scrape its chunks in parallel, and return the merged result."""
    file_contents = await file.read()
    file_name = file.filename

    start_time = time.time()
    scrape_response_list = await execute_pdfscraper_async(file_contents, file_name, pages_per_chunk)
    content, tables = collect_pdfscraper_response(scrape_response_list)
    end_time = time.time()

    time_taken = end_time - start_time
    return JSONResponse(content={"content": content, "tables": tables, "time_taken": time_taken})
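

# Optional entry point (a sketch, not part of the original snippet): serve the app
# with uvicorn, e.g. `python app.py`. The module name, host, and port are assumptions.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)

# Example request against the endpoint above, assuming the server is listening on
# localhost:8000 and a local sample.pdf exists (pages_per_chunk is a query parameter):
#   curl -X POST "http://localhost:8000/process-pdf/?pages_per_chunk=2" \
#        -F "file=@sample.pdf;type=application/pdf"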