Spaces:

Didier
/

Docs_QA_ColBERT_DSPy

Running

Didier Guillevic

Initial commit

1c18375 about 2 months ago

5.48 kB

	""" pdf_utils.py

	Utilities for working with PDFs

	:author: Didier Guillevic
	:email: [email protected]
	:creation: 2024-12-21
	"""

	import pypdf
	import os
	import datetime
	import pytz

	import logging
	# Module-level logger, named after this module.
	logger = logging.getLogger(__name__)
	# NOTE: this configures the root logger as a side effect of importing the
	# module (level INFO). basicConfig does nothing if the root logger already
	# has handlers, so an application that configures logging first wins.
	logging.basicConfig(level=logging.INFO)


	def validate_pdf(file_path: str) -> bool:
	    """Validate that a path points to an existing file with a PDF extension.

	    Only the path is inspected: the file is not opened, so its content is not
	    checked for being a well-formed PDF.

	    Args:
	        file_path: path to the candidate PDF file

	    Returns:
	        True if the path is an existing regular file whose name ends with
	        ".pdf" (case-insensitive), False otherwise. The reason for a
	        rejection is logged as an error.
	    """
	    if not os.path.exists(file_path):
	        logger.error("File not found at path: %s", file_path)
	        return False
	    # A directory can exist and still be named "something.pdf"; reject it here
	    # rather than letting a later open() fail.
	    if not os.path.isfile(file_path):
	        logger.error("Path is not a file: %s", file_path)
	        return False
	    if not file_path.lower().endswith('.pdf'):
	        logger.error("File is not a PDF")
	        return False
	    return True


	def get_text_from_pdf(
	    file_path: str,
	    max_chars: int = 100_000_000
	) -> str:
	    """Extract the text from a given PDF file.

	    Pages are read in order and their text is joined with a newline between
	    pages. Reading stops at the first page whose text would push the running
	    total of page characters past ``max_chars``; that page is truncated. The
	    newline separators are not counted against the limit.

	    Args:
	        file_path: path to the PDF file
	        max_chars: max length (in chars) to be read from the file

	    Returns:
	        the extracted text, or None if the path fails validation or the file
	        cannot be read as a PDF.
	    """
	    # pypdf keeps its exception classes in pypdf.errors; they are not
	    # re-exported at package level, so `pypdf.PdfReadError` would itself fail
	    # with AttributeError while an exception is being handled.
	    from pypdf.errors import PdfReadError

	    if not validate_pdf(file_path):
	        return None

	    try:
	        with open(file_path, 'rb') as file:
	            pdf_reader = pypdf.PdfReader(file)

	            num_pages = len(pdf_reader.pages)
	            logger.info("Processing PDF with %d pages...", num_pages)

	            extracted_text = []
	            total_chars = 0

	            for page_num, page in enumerate(pdf_reader.pages, start=1):
	                # Treat a page with no extractable text as empty.
	                text = page.extract_text() or ""

	                # Would this page's text exceed the limit?
	                if total_chars + len(text) > max_chars:
	                    # Only keep what still fits; clamp so that a negative
	                    # limit cannot turn into a slice from the end.
	                    remaining_chars = max(max_chars - total_chars, 0)
	                    extracted_text.append(text[:remaining_chars])
	                    logger.info(
	                        "Reached %d character limit at page %d",
	                        max_chars, page_num
	                    )
	                    break

	                extracted_text.append(text)
	                total_chars += len(text)
	                logger.debug("Processed page %d/%d", page_num, num_pages)

	        final_text = '\n'.join(extracted_text)
	        logger.info("Extraction complete! Total characters: %d", len(final_text))
	        return final_text

	    except PdfReadError:
	        logger.error("Error: Invalid or corrupted PDF file")
	        return None
	    except Exception as e:
	        # Deliberate best-effort boundary: report with traceback, return None.
	        logger.exception("An unexpected error occurred: %s", e)
	        return None


	def get_pdf_metadata(file_path: str) -> dict:
	    """Get the metadata of a given PDF file.

	    Args:
	        file_path: path to a PDF file

	    Returns:
	        dictionary with the metadata information, holding two entries:
	        'num_pages' (number of pages) and 'metadata' (the reader's document
	        information object, which may be None when the PDF carries none).
	        None is returned instead of a dictionary if the path fails validation
	        or the file cannot be read.
	    """
	    if not validate_pdf(file_path):
	        return None

	    try:
	        with open(file_path, 'rb') as file:
	            pdf_reader = pypdf.PdfReader(file)
	            metadata = {
	                'num_pages': len(pdf_reader.pages),
	                'metadata': pdf_reader.metadata
	            }
	            return metadata
	    except Exception as e:
	        # Best-effort: report through the module logger (with traceback) like
	        # validate_pdf does, and signal the failure with None.
	        logger.exception("Error extracting metadata: %s", e)
	        return None


	def get_datetime_from_pdf_metadata(metadata: dict, key: str) -> str:
	    """Extract a datetime string from the metadata of a PDF file.

	    PDF date strings have the form "D:YYYYMMDDHHmmSS" followed by an optional
	    timezone part ("+HH'mm'", "-HH'mm'" or "Z"). Every field after the year
	    is optional; a missing month or day is read as 01 and a missing time
	    field as 00. The returned string is the wall-clock time exactly as written
	    in the PDF: the timezone part is ignored, not applied.
	    e.g. "D:20210714143000+02'00'" -> "2021-07-14 14:30:00"

	    Args:
	        metadata: dictionary with the metadata information (may be None)
	        key: key to extract the datetime from

	    Returns:
	        the datetime string formatted as "%Y-%m-%d %H:%M:%S", or None if the
	        key is not found or its value cannot be parsed as a PDF date.
	    """
	    import re

	    if not metadata or key not in metadata:
	        return None

	    pdf_date_string = metadata[key]
	    if isinstance(pdf_date_string, bytes):
	        pdf_date_string = pdf_date_string.decode('latin-1')

	    # The "D:" prefix is optional in practice. Anything after the last
	    # complete two-digit field (including the timezone part) is left
	    # unmatched on purpose.
	    match = re.match(
	        r"\s*(?:D:)?([0-9]{4})([0-9]{2})?([0-9]{2})?([0-9]{2})?([0-9]{2})?([0-9]{2})?",
	        str(pdf_date_string),
	    )
	    if match is None:
	        return None

	    year, *optional_fields = match.groups()
	    # Defaults for month, day, hour, minute, second when the field is absent.
	    defaults = (1, 1, 0, 0, 0)
	    components = [
	        default if value is None else int(value)
	        for value, default in zip(optional_fields, defaults)
	    ]

	    try:
	        dt = datetime.datetime(int(year), *components)
	    except ValueError:
	        # Out-of-range field (e.g. month 13): not a usable date.
	        return None

	    return dt.strftime("%Y-%m-%d %H:%M:%S")


	def get_metadata_info(pdf_path: str) -> dict:
	    """Build a dictionary with basic and additional information about a PDF file.

	    Args:
	        pdf_path: path to the PDF file

	    Returns:
	        dictionary with the metadata information. 'file_name' is always
	        present; 'num_pages', 'creation_date' and 'modification_date' are
	        added only when the PDF metadata could be read. Either date is None
	        when the PDF does not provide it.
	    """
	    # basic information about the file
	    metadata_info = {'file_name': os.path.basename(pdf_path)}

	    # additional information about the file
	    pdf_metadata = get_pdf_metadata(pdf_path)
	    if pdf_metadata:
	        metadata_info['num_pages'] = pdf_metadata['num_pages']

	        # The reader's metadata can be None when the PDF carries no document
	        # information; use an empty mapping so the date lookups yield None
	        # instead of failing on a membership test against None.
	        document_info = pdf_metadata['metadata'] or {}
	        metadata_info['creation_date'] = get_datetime_from_pdf_metadata(
	            document_info, '/CreationDate'
	        )
	        metadata_info['modification_date'] = get_datetime_from_pdf_metadata(
	            document_info, '/ModDate'
	        )

	    return metadata_info