"""
pdf_utils.py

Utilities for working with PDFs

:author: Didier Guillevic
:email: didier@guillevic.net
:creation: 2024-12-21
"""

import datetime
import logging
import os
import re
from typing import Optional

import pypdf
import pytz
from pypdf.errors import PdfReadError

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# PDF date strings look like "D:YYYYMMDDHHmmSSOHH'mm'" where everything after
# the year is optional and O is one of '+', '-' or 'Z'. The pattern is applied
# after the apostrophes around the offset have been removed.
_PDF_DATE_PATTERN = re.compile(
    r"^(?:D:)?"
    r"(?P<year>\d{4})"
    r"(?:(?P<month>\d{2})"
    r"(?:(?P<day>\d{2})"
    r"(?:(?P<hour>\d{2})"
    r"(?:(?P<minute>\d{2})"
    r"(?:(?P<second>\d{2}))?)?)?)?)?"
    r"(?:(?P<sign>[+\-Z])(?P<off_hours>\d{2})?(?P<off_minutes>\d{2})?)?"
    r"$"
)


def validate_pdf(file_path: str) -> bool:
    """Validate that the file exists and has a ``.pdf`` extension.

    Only the path is checked; the file content is not inspected.

    Args:
        file_path: path to the candidate PDF file (str or path-like)

    Returns:
        True if the path points to an existing regular file whose name ends
        with ``.pdf`` (case-insensitive), False otherwise.
    """
    path = os.fspath(file_path)

    # isfile (rather than exists) so that a directory named "x.pdf" is rejected
    if not os.path.isfile(path):
        logger.error("File not found at path: %s", path)
        return False

    if not path.lower().endswith('.pdf'):
        logger.error("File is not a PDF")
        return False

    return True


def get_text_from_pdf(
        file_path: str,
        max_chars: int = 100_000_000
    ) -> Optional[str]:
    """Extract the text from a given PDF file.

    Pages are read in order and joined with a newline. Reading stops as soon
    as the accumulated page text reaches ``max_chars``; the page that crosses
    the limit is truncated. The newline separators between pages are not
    counted against the limit.

    Args:
        file_path: path to the PDF file
        max_chars: max length (in chars) to be read from the file

    Returns:
        the extracted text, or None if the file is missing, is not a PDF, or
        cannot be read.
    """
    if not validate_pdf(file_path):
        return None

    try:
        with open(file_path, 'rb') as file:
            pdf_reader = pypdf.PdfReader(file)
            num_pages = len(pdf_reader.pages)
            logger.info("Processing PDF with %d pages...", num_pages)

            extracted_text = []
            total_chars = 0

            for page_num, page in enumerate(pdf_reader.pages, start=1):
                text = page.extract_text() or ""

                # Clamp at 0 so a negative max_chars cannot turn the slice
                # below into a "count from the end" slice.
                remaining_chars = max(max_chars - total_chars, 0)
                if len(text) > remaining_chars:
                    extracted_text.append(text[:remaining_chars])
                    logger.info(
                        "Reached %d character limit at page %d",
                        max_chars, page_num
                    )
                    break

                extracted_text.append(text)
                total_chars += len(text)
                logger.debug("Processed page %d/%d", page_num, num_pages)

        final_text = '\n'.join(extracted_text)
        logger.info(
            "Extraction complete! Total characters: %d", len(final_text)
        )
        return final_text

    except PdfReadError:
        # PdfReadError lives in pypdf.errors, not in the top-level package.
        logger.error("Error: Invalid or corrupted PDF file")
        return None
    except Exception as e:
        logger.exception("An unexpected error occurred: %s", e)
        return None


def get_pdf_metadata(file_path: str) -> Optional[dict]:
    """Get the metadata of a given PDF file.

    Args:
        file_path: path to a PDF file

    Returns:
        dictionary with two entries, ``'num_pages'`` (page count) and
        ``'metadata'`` (the reader's ``metadata`` attribute, returned as is),
        or None if the file is missing, is not a PDF, or cannot be read.
    """
    if not validate_pdf(file_path):
        return None

    try:
        with open(file_path, 'rb') as file:
            pdf_reader = pypdf.PdfReader(file)
            return {
                'num_pages': len(pdf_reader.pages),
                'metadata': pdf_reader.metadata,
            }
    except Exception as e:
        logger.exception("Error extracting metadata: %s", e)
        return None


def get_datetime_from_pdf_metadata(metadata: dict, key: str) -> Optional[str]:
    """Extract a datetime string from the metadata of a PDF file.

    The value is expected in the PDF date format, e.g.
    "D:20210714143000+02'00'" -> "2021-07-14 14:30:00". The returned string
    is the local time as written in the PDF; the UTC offset is validated but
    not included in the output. Missing trailing components default to the
    first month/day and to midnight, and a missing or 'Z' offset means UTC.

    Args:
        metadata: dictionary with the metadata information (may be None)
        key: key to extract the datetime from

    Returns:
        the datetime string formatted as "%Y-%m-%d %H:%M:%S", or None if the
        key is not found or its value is not a valid PDF date.
    """
    if not metadata or key not in metadata:
        return None

    pdf_date_string = str(metadata[key]).strip()

    # Drop the apostrophes around the timezone offset before matching
    match = _PDF_DATE_PATTERN.match(pdf_date_string.replace("'", ""))
    if match is None:
        logger.warning(
            "Unrecognized PDF date for %s: %r", key, pdf_date_string
        )
        return None

    parts = match.groupdict()
    try:
        dt = datetime.datetime(
            int(parts['year']),
            int(parts['month'] or 1),
            int(parts['day'] or 1),
            int(parts['hour'] or 0),
            int(parts['minute'] or 0),
            int(parts['second'] or 0),
        )

        # pytz.FixedOffset expects minutes EAST of UTC: '+' is positive
        offset = (
            int(parts['off_hours'] or 0) * 60
            + int(parts['off_minutes'] or 0)
        )
        if parts['sign'] == '-':
            offset = -offset

        dt = pytz.FixedOffset(offset).localize(dt)
    except ValueError:
        # Out-of-range component (e.g. month 13) or offset of a day or more
        logger.warning("Invalid PDF date for %s: %r", key, pdf_date_string)
        return None

    return dt.strftime("%Y-%m-%d %H:%M:%S")


def get_metadata_info(pdf_path: str) -> dict:
    """Build a dictionary with basic and additional information about a PDF file.

    Args:
        pdf_path: path to the PDF file

    Returns:
        dictionary with the metadata information. ``'file_name'`` is always
        present; ``'num_pages'``, ``'creation_date'`` and
        ``'modification_date'`` are added only when the PDF could be read
        (each date may be None).
    """
    # basic information about the file
    metadata_info = {'file_name': os.path.basename(pdf_path)}

    # additional information about the file
    pdf_metadata = get_pdf_metadata(pdf_path)
    if pdf_metadata:
        document_info = pdf_metadata['metadata']
        metadata_info['num_pages'] = pdf_metadata['num_pages']
        metadata_info['creation_date'] = get_datetime_from_pdf_metadata(
            document_info, '/CreationDate')
        metadata_info['modification_date'] = get_datetime_from_pdf_metadata(
            document_info, '/ModDate')

    return metadata_info