Spaces:
Sleeping
Sleeping
File size: 5,483 Bytes
1c18375 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 |
""" pdf_utils.py
Utilities for working with PDFs
:author: Didier Guillevic
:email: [email protected]
:creation: 2024-12-21
"""
import pypdf
import os
import datetime
import pytz
import logging
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Configure root logging at import time so that INFO-and-above records are
# emitted (basicConfig has no effect if the root logger already has handlers).
logging.basicConfig(level=logging.INFO)
def validate_pdf(file_path: str) -> bool:
    """Check that a path exists and carries a '.pdf' extension.

    Args:
        file_path: path of the file to check

    Returns:
        True when the path exists and its name ends with '.pdf'
        (case-insensitive); False otherwise, after logging the reason.
    """
    path_exists = os.path.exists(file_path)
    if path_exists:
        has_pdf_suffix = file_path.lower().endswith('.pdf')
        if has_pdf_suffix:
            return True
        # The path is there, but the extension is wrong.
        logger.error("File is not a PDF")
        return False

    logger.error(f"File not found at path: {file_path}")
    return False
def get_text_from_pdf(
    file_path: str,
    max_chars: int = 100_000_000
) -> str:
    """Extract the text from a given PDF file.

    Pages are read in order and their text is joined with newlines. Reading
    stops as soon as ``max_chars`` characters of page text have been
    collected; the page that crosses the limit is truncated.

    Args:
        file_path: path to the PDF file
        max_chars: max length (in chars) to be read from the file. The
            newline separators inserted between pages are not counted.

    Returns:
        the extracted text, or None if the file fails validation or cannot
        be read.
    """
    # pypdf does not re-export its exception classes at package level; they
    # live in ``pypdf.errors``. Referencing ``pypdf.PdfReadError`` in an
    # ``except`` clause raises AttributeError as soon as any error occurs.
    from pypdf.errors import PdfReadError

    if not validate_pdf(file_path):
        return None

    try:
        with open(file_path, 'rb') as file:
            pdf_reader = pypdf.PdfReader(file)
            num_pages = len(pdf_reader.pages)
            print(f"Processing PDF with {num_pages} pages...")

            extracted_text = []
            total_chars = 0

            for page_num, page in enumerate(pdf_reader.pages, start=1):
                # Treat a page without extractable text as empty.
                text = page.extract_text() or ""

                # Check if adding this page's text would exceed the limit
                if total_chars + len(text) > max_chars:
                    # Only add text up to the limit; clamp at zero so a
                    # negative max_chars cannot turn into a from-the-end slice.
                    remaining_chars = max(max_chars - total_chars, 0)
                    extracted_text.append(text[:remaining_chars])
                    print(f"Reached {max_chars} character limit at page {page_num}")
                    break

                extracted_text.append(text)
                total_chars += len(text)
                print(f"Processed page {page_num}/{num_pages}")

        final_text = '\n'.join(extracted_text)
        print(f"\nExtraction complete! Total characters: {len(final_text)}")
        return final_text

    except PdfReadError:
        print("Error: Invalid or corrupted PDF file")
        return None
    except Exception as e:
        # Best-effort API: report the problem and signal failure with None.
        print(f"An unexpected error occurred: {str(e)}")
        return None
def get_pdf_metadata(file_path: str) -> dict:
    """Read the page count and the metadata object of a PDF file.

    Args:
        file_path: path to a PDF file

    Returns:
        dictionary with the keys 'num_pages' (number of pages) and
        'metadata' (the metadata object reported by the PDF reader), or
        None when the file fails validation or cannot be read.
    """
    if validate_pdf(file_path):
        try:
            with open(file_path, 'rb') as pdf_file:
                reader = pypdf.PdfReader(pdf_file)
                page_count = len(reader.pages)
                document_info = reader.metadata
                return {'num_pages': page_count, 'metadata': document_info}
        except Exception as e:
            # Any read problem is reported and turned into a None result.
            print(f"Error extracting metadata: {str(e)}")

    return None
def get_datetime_from_pdf_metadata(metadata: dict, key: str) -> str:
    """Extract a datetime string from the metadata of a PDF file.

    PDF date strings have the form ``D:YYYYMMDDHHmmSSOHH'mm'``. Every field
    after the year is optional (month and day default to 01, time fields to
    00), and the UTC offset may be absent or given as ``Z``.

    The returned value is the wall-clock time exactly as written in the PDF;
    the UTC offset is deliberately not applied, e.g.
    "D:20210714143000+02'00'" -> "2021-07-14 14:30:00".

    Args:
        metadata: dictionary with the metadata information (may be None)
        key: key to extract the datetime from

    Returns:
        the datetime string formatted as "YYYY-MM-DD HH:MM:SS", or None if
        the key is missing or its value is not a parseable PDF date
    """
    if not metadata or key not in metadata:
        return None

    pdf_date_string = metadata[key]
    if not isinstance(pdf_date_string, str):
        return None

    # The 'D:' prefix is recommended but not always present.
    date_string = pdf_date_string.strip()
    if date_string.startswith("D:"):
        date_string = date_string[2:]

    # Count the leading date/time digits (at most YYYYMMDDHHmmSS); whatever
    # follows (offset, 'Z', quotes) does not influence the result.
    digit_count = 0
    for char in date_string[:14]:
        if char not in "0123456789":
            break
        digit_count += 1

    # At least the year is required and fields come in two-digit pairs.
    if digit_count < 4 or digit_count % 2:
        return None

    # Fill omitted trailing fields with their defaults.
    defaults = "00000101000000"
    padded = date_string[:digit_count] + defaults[digit_count:]

    try:
        dt = datetime.datetime.strptime(padded, "%Y%m%d%H%M%S")
    except ValueError:
        # Digits present but not a real date (e.g. month 13).
        return None

    return dt.strftime("%Y-%m-%d %H:%M:%S")
def get_metadata_info(pdf_path: str) -> dict:
    """Build a dictionary with basic and additional information about a PDF file.

    Args:
        pdf_path: path to the PDF file

    Returns:
        dictionary with the metadata information. 'file_name' is always
        present. When the PDF could be read, 'num_pages', 'creation_date'
        and 'modification_date' are added; a date is None when the PDF does
        not provide it or it cannot be parsed.
    """
    # basic information about the file
    metadata_info = {'file_name': os.path.basename(pdf_path)}

    # additional information about the file
    pdf_metadata = get_pdf_metadata(pdf_path)
    if not pdf_metadata:
        return metadata_info

    metadata_info['num_pages'] = pdf_metadata['num_pages']

    # The reader reports None when the PDF has no document information
    # dictionary; treat that as "no dates available" instead of crashing.
    document_info = pdf_metadata['metadata'] or {}

    date_fields = (
        ('creation_date', '/CreationDate'),
        ('modification_date', '/ModDate'),
    )
    for info_key, pdf_key in date_fields:
        try:
            metadata_info[info_key] = get_datetime_from_pdf_metadata(
                document_info, pdf_key
            )
        except (ValueError, TypeError, AttributeError) as e:
            # A malformed date must not discard the rest of the summary.
            print(f"Could not parse {pdf_key}: {str(e)}")
            metadata_info[info_key] = None

    return metadata_info
|