# Docs_QA_ColBERT_DSPy / pdf_utils.py
# Didier Guillevic
# Initial commit
# 1c18375
""" pdf_utils.py
Utilities for working with PDFs
:author: Didier Guillevic
:email: [email protected]
:creation: 2024-12-21
"""
import pypdf
import os
import datetime
import pytz
import logging
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Configure logging at INFO level as a side effect of importing this module.
logging.basicConfig(level=logging.INFO)
def validate_pdf(file_path: str) -> bool:
    """Validate that a path points to an existing file with a '.pdf' extension.

    Only the path is inspected (existence and, case-insensitively, the
    extension); the file content is not opened or parsed.

    Args:
        file_path: path to the candidate PDF file

    Returns:
        True if the path is an existing file (not a directory) whose name
        ends with '.pdf', False otherwise. The reason for a rejection is
        logged as an error.
    """
    # os.path.isfile (rather than os.path.exists) so that a directory that
    # happens to be named "something.pdf" is rejected as well.
    if not os.path.isfile(file_path):
        logger.error(f"File not found at path: {file_path}")
        return False

    if not file_path.lower().endswith('.pdf'):
        logger.error("File is not a PDF")
        return False

    return True
def get_text_from_pdf(
    file_path: str,
    max_chars: int = 100_000_000
) -> str:
    """Extract the text from a given PDF file.

    Pages are read in order and their text is joined with newlines. Reading
    stops at the page where the accumulated page text would exceed
    ``max_chars``; that page is truncated to fit. Progress and errors are
    reported with ``print``.

    Args:
        file_path: path to the PDF file
        max_chars: max length (in chars) of page text to be read from the
            file. The newlines inserted between pages are not counted.

    Returns:
        the extracted text, or None if the path fails validation or the
        file could not be read.
    """
    # PdfReadError is defined in pypdf.errors; `pypdf.PdfReadError` is not an
    # attribute of the top-level package, so referencing it in an `except`
    # clause raises AttributeError instead of handling the original error.
    from pypdf.errors import PdfReadError

    if not validate_pdf(file_path):
        return None

    try:
        with open(file_path, 'rb') as file:
            pdf_reader = pypdf.PdfReader(file)

            num_pages = len(pdf_reader.pages)
            print(f"Processing PDF with {num_pages} pages...")

            extracted_text = []
            total_chars = 0

            # page_num is 1-based because it is only used in messages
            for page_num, page in enumerate(pdf_reader.pages, start=1):
                text = page.extract_text()

                # Check if adding this page's text would exceed the limit
                if total_chars + len(text) > max_chars:
                    # Only add text up to the limit. Clamp at 0 so that a
                    # negative max_chars cannot turn into a negative slice
                    # bound (which would keep almost the whole page).
                    remaining_chars = max(max_chars - total_chars, 0)
                    extracted_text.append(text[:remaining_chars])
                    print(f"Reached {max_chars} character limit at page {page_num}")
                    break

                extracted_text.append(text)
                total_chars += len(text)
                print(f"Processed page {page_num}/{num_pages}")

            final_text = '\n'.join(extracted_text)
            print(f"\nExtraction complete! Total characters: {len(final_text)}")
            return final_text

    except PdfReadError:
        print("Error: Invalid or corrupted PDF file")
        return None
    except Exception as e:
        # Deliberate best-effort: any other failure is reported and mapped
        # to None rather than propagated to the caller.
        print(f"An unexpected error occurred: {str(e)}")
        return None
def get_pdf_metadata(file_path: str) -> dict:
    """Read the page count and the document metadata of a PDF file.

    Args:
        file_path: path to a PDF file

    Returns:
        dictionary with two entries, 'num_pages' (number of pages) and
        'metadata' (the reader's ``metadata`` attribute), or None when the
        path fails validation or reading the file raises an exception
    """
    if validate_pdf(file_path):
        try:
            with open(file_path, 'rb') as pdf_stream:
                reader = pypdf.PdfReader(pdf_stream)
                page_count = len(reader.pages)
                document_info = reader.metadata
                return {'num_pages': page_count, 'metadata': document_info}
        except Exception as e:
            # Any failure is reported and falls through to the None result
            print(f"Error extracting metadata: {str(e)}")

    return None
def get_datetime_from_pdf_metadata(metadata: dict, key: str) -> str:
    """Extract a datetime string from the metadata of a PDF file.

    PDF dates have the form "D:YYYYMMDDHHmmSSOHH'mm'", where every field
    after the year is optional (month and day default to 01, time fields
    to 00) and the offset part may be missing or just "Z".

    The returned value is the wall-clock time exactly as written in the
    PDF; the UTC offset, if present, is not applied.
    e.g. "D:20210714143000+02'00'" -> "2021-07-14 14:30:00"

    Args:
        metadata: dictionary with the metadata information (may be None)
        key: key to extract the datetime from (e.g. '/CreationDate')

    Returns:
        the datetime string formatted as "YYYY-MM-DD HH:MM:SS", or None if
        the key is not found or its value is not a parsable PDF date
    """
    # `key in metadata` / `metadata[key]` are kept (rather than .get) so the
    # mapping's own item access is used, whatever dict subclass is passed in.
    if not metadata or key not in metadata:
        return None

    pdf_date_string = metadata[key]
    if not isinstance(pdf_date_string, str):
        return None

    # Drop the 'D:' prefix when present (some producers omit it)
    date_string = pdf_date_string.strip()
    if date_string.startswith("D:"):
        date_string = date_string[2:]

    # Count the leading date/time digits (at most YYYYMMDDHHmmSS = 14).
    # Whatever follows (offset such as +02'00', 'Z', or nothing) is ignored.
    num_digits = 0
    while (
        num_digits < 14
        and num_digits < len(date_string)
        and date_string[num_digits] in "0123456789"
    ):
        num_digits += 1

    # Need at least the year, and only complete two-digit fields after it
    if num_digits < 4 or num_digits % 2 != 0:
        return None

    # Fill the missing trailing fields with their defaults: MM=01, DD=01,
    # HH=00, mm=00, SS=00
    field_defaults = "0101000000"
    full_date = date_string[:num_digits] + field_defaults[num_digits - 4:]

    try:
        dt = datetime.datetime.strptime(full_date, "%Y%m%d%H%M%S")
    except ValueError:
        # Out-of-range field (e.g. month 13): not a usable date
        return None

    return dt.strftime("%Y-%m-%d %H:%M:%S")
def get_metadata_info(pdf_path: str) -> dict:
    """Build a dictionary with basic and additional information about a PDF file.

    Args:
        pdf_path: path to the PDF file

    Returns:
        dictionary with the metadata information. 'file_name' is always
        present; 'num_pages', 'creation_date' and 'modification_date' are
        added only when the PDF metadata could be read (each date is None
        when it is not available).
    """
    # basic information about the file
    metadata_info = {'file_name': os.path.basename(pdf_path)}

    # additional information about the file
    pdf_metadata = get_pdf_metadata(pdf_path)
    if pdf_metadata:
        metadata_info['num_pages'] = pdf_metadata['num_pages']

        # The reader's metadata is None for a PDF without a document
        # information dictionary; use an empty mapping in that case so the
        # date lookups yield None instead of failing on `key in None`.
        document_info = pdf_metadata['metadata']
        if document_info is None:
            document_info = {}

        metadata_info['creation_date'] = get_datetime_from_pdf_metadata(
            document_info, '/CreationDate'
        )
        metadata_info['modification_date'] = get_datetime_from_pdf_metadata(
            document_info, '/ModDate'
        )

    return metadata_info