Spaces:
Running
Running
""" pdf_utils.py
Utilities for working with PDFs
:author: Didier Guillevic
:email: [email protected]
:creation: 2024-12-21
"""
import datetime
import logging
import os
import re
from typing import Optional

import pypdf
import pytz
# Module-level logger shared by every helper in this file.
logger = logging.getLogger(__name__)
# NOTE: this configures the root logger at import time; basicConfig() is a
# no-op when the root logger already has handlers installed.
logging.basicConfig(level=logging.INFO)
def validate_pdf(file_path: str) -> bool:
    """Validate that a path points to an existing file with a ``.pdf`` extension.

    Only the file name is inspected; the file content is not opened or parsed.

    Args:
        file_path: path to the candidate PDF file

    Returns:
        True when the path is a regular file whose name ends with ".pdf"
        (case-insensitive), False otherwise. The reason for a rejection is
        logged at error level.
    """
    # Callers may hand over None / "" when no file was selected.
    if not file_path:
        logger.error("No file path provided")
        return False

    # isfile() rather than exists(): a directory named "something.pdf" must
    # not be accepted as a PDF document.
    if not os.path.isfile(file_path):
        logger.error("File not found at path: %s", file_path)
        return False

    if not os.fspath(file_path).lower().endswith('.pdf'):
        logger.error("File is not a PDF")
        return False

    return True
def get_text_from_pdf(
    file_path: str,
    max_chars: int = 100_000_000
) -> Optional[str]:
    """Extract the text from a given PDF file.

    Pages are read in order and their texts are joined with a newline.
    Reading stops as soon as the accumulated page text reaches ``max_chars``;
    the page that crosses the limit is truncated.

    Args:
        file_path: path to the PDF file
        max_chars: max length (in chars) of page text to be read from the
            file (the newline separators between pages are not counted)

    Returns:
        the extracted text, or None when the path is not a valid PDF path or
        the file could not be read.
    """
    # pypdf does not re-export PdfReadError at package level; it lives in
    # the `errors` submodule.
    from pypdf.errors import PdfReadError

    if not validate_pdf(file_path):
        return None

    try:
        with open(file_path, 'rb') as file:
            pdf_reader = pypdf.PdfReader(file)
            num_pages = len(pdf_reader.pages)
            logger.info("Processing PDF with %d pages...", num_pages)

            extracted_text = []
            total_chars = 0
            for page_number, page in enumerate(pdf_reader.pages, start=1):
                # Guard against pages that yield no text object at all.
                text = page.extract_text() or ""

                if total_chars + len(text) > max_chars:
                    # Keep only what still fits; clamp at 0 so a negative
                    # budget never turns into a slice from the end.
                    remaining_chars = max(max_chars - total_chars, 0)
                    extracted_text.append(text[:remaining_chars])
                    logger.info(
                        "Reached %d character limit at page %d",
                        max_chars, page_number,
                    )
                    break

                extracted_text.append(text)
                total_chars += len(text)
                logger.info("Processed page %d/%d", page_number, num_pages)

            final_text = '\n'.join(extracted_text)
            logger.info(
                "Extraction complete! Total characters: %d", len(final_text)
            )
            return final_text

    except PdfReadError:
        logger.error("Error: Invalid or corrupted PDF file")
        return None
    except Exception as e:
        # Best-effort helper: report the problem and signal failure with None.
        logger.exception("An unexpected error occurred: %s", e)
        return None
def get_pdf_metadata(file_path: str) -> Optional[dict]:
    """Get the metadata of a given PDF file.

    Args:
        file_path: path to a PDF file

    Returns:
        dictionary with the metadata information, or None when the path is
        not a valid PDF path or the file could not be read. Keys:
            'num_pages': number of pages in the document
            'metadata': the reader's ``metadata`` attribute, passed through
                unchanged (NOTE(review): pypdf documents this as possibly
                None when the file has no document information; callers
                should handle that case)
    """
    if not validate_pdf(file_path):
        return None

    try:
        with open(file_path, 'rb') as file:
            pdf_reader = pypdf.PdfReader(file)
            return {
                'num_pages': len(pdf_reader.pages),
                'metadata': pdf_reader.metadata,
            }
    except Exception as e:
        # Best-effort helper: report the problem and signal failure with None.
        logger.exception("Error extracting metadata: %s", e)
        return None
# PDF date strings have the form "D:YYYYMMDDHHmmSSOHH'mm'". Only the year is
# mandatory; every later field (and the "D:" prefix itself) may be missing.
_PDF_DATE_PATTERN = re.compile(
    r"(?:D:)?(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?",
    re.ASCII,
)


def get_datetime_from_pdf_metadata(
    metadata: Optional[dict],
    key: str
) -> Optional[str]:
    """Extract a datetime string from the metadata of a PDF file.

    The value is returned as the wall-clock time written in the PDF, e.g.
    "D:20210714143000+02'00'" -> "2021-07-14 14:30:00". Any UTC offset
    suffix ("Z", "+02'00'", ...) is accepted but not applied to the result.
    Date fields that are absent default to the first month / first day /
    midnight.

    Args:
        metadata: dictionary with the metadata information (may be None)
        key: key to extract the datetime from (e.g. '/CreationDate')

    Returns:
        the datetime string formatted as "YYYY-MM-DD HH:MM:SS", or None if
        the key is not found or its value is not a parseable PDF date.
    """
    if not metadata or key not in metadata:
        return None

    raw_value = metadata[key]
    if not isinstance(raw_value, str):
        return None

    match = _PDF_DATE_PATTERN.match(raw_value.strip())
    if match is None:
        return None

    year, month, day, hour, minute, second = match.groups()
    try:
        parsed = datetime.datetime(
            int(year),
            int(month or 1),
            int(day or 1),
            int(hour or 0),
            int(minute or 0),
            int(second or 0),
        )
    except ValueError:
        # Out-of-range component such as month 13 or day 00.
        return None

    return parsed.strftime("%Y-%m-%d %H:%M:%S")
def get_metadata_info(pdf_path: str) -> dict:
    """Build a dictionary with basic and additional information about a PDF file.

    Args:
        pdf_path: path to the PDF file

    Returns:
        dictionary with the metadata information. 'file_name' is always
        present. 'num_pages', 'creation_date' and 'modification_date' are
        added only when the PDF metadata could be read; a date that is
        missing or cannot be parsed is reported as None.
    """
    # basic information about the file
    metadata_info = {'file_name': os.path.basename(pdf_path)}

    # additional information about the file
    pdf_metadata = get_pdf_metadata(pdf_path)
    if not pdf_metadata:
        return metadata_info

    metadata_info['num_pages'] = pdf_metadata['num_pages']

    # The 'metadata' entry may be None (no document information in the file);
    # fall back to an empty mapping so the date lookups simply find nothing.
    document_info = pdf_metadata['metadata'] or {}
    date_fields = (
        ('creation_date', '/CreationDate'),
        ('modification_date', '/ModDate'),
    )
    for field_name, pdf_key in date_fields:
        try:
            metadata_info[field_name] = get_datetime_from_pdf_metadata(
                document_info, pdf_key
            )
        except (ValueError, TypeError, IndexError):
            # One malformed date must not discard the rest of the summary.
            logger.warning("Could not parse %s from PDF metadata", pdf_key)
            metadata_info[field_name] = None

    return metadata_info