Spaces:

Didier
/

Docs_QA_ColBERT_DSPy

Sleeping

File size: 5,483 Bytes

1c18375

""" pdf_utils.py

Utilities for working with PDFs

:author: Didier Guillevic
:email: [email protected]
:creation: 2024-12-21
"""

import datetime
import os
import re
from typing import Optional

import pypdf
import pytz

import logging
# Module-level logger named after this module; used by the helpers below.
logger = logging.getLogger(__name__)
# NOTE(review): this runs at import time and configures logging for the whole
# process at INFO level — confirm that is intended for a utility module.
logging.basicConfig(level=logging.INFO)


def validate_pdf(file_path: str) -> bool:
    """Validate that the path points to an existing file with a .pdf extension.

    Only the path is inspected (existence and case-insensitive file-name
    suffix); the file content is neither opened nor parsed.

    Args:
        file_path: path to the candidate PDF file

    Returns:
        True if the path is an existing file ending in ".pdf", False
        otherwise (the reason is logged as an error).
    """
    # isfile() rather than exists(): a directory named "something.pdf" must
    # be rejected here, otherwise it only fails later when it gets opened.
    if not os.path.isfile(file_path):
        logger.error(f"File not found at path: {file_path}")
        return False
    if not file_path.lower().endswith('.pdf'):
        logger.error("File is not a PDF")
        return False
    return True


def get_text_from_pdf(
        file_path: str,
        max_chars: int = 100_000_000
    ) -> Optional[str]:
    """Extract the text from a given PDF file.

    Pages are read in order and their text is joined with newlines.
    Extraction stops at the first page whose text would push the running
    total past ``max_chars``; that page is truncated to fit.

    Args:
        file_path: path to the PDF file
        max_chars: max length (in chars) of page text to be read from the
            file (the newlines inserted between pages are not counted)

    Returns:
        the extracted text, or None if the path fails validation or the
        file cannot be read.
    """
    # pypdf exposes its exception classes in the ``pypdf.errors`` module,
    # not on the top-level package.
    from pypdf.errors import PdfReadError

    if not validate_pdf(file_path):
        return None

    try:
        with open(file_path, 'rb') as file:
            pdf_reader = pypdf.PdfReader(file)
            num_pages = len(pdf_reader.pages)
            logger.info(f"Processing PDF with {num_pages} pages...")

            extracted_text = []
            total_chars = 0

            for page_num, page in enumerate(pdf_reader.pages, start=1):
                # Defensive: treat a missing extraction result as empty text.
                text = page.extract_text() or ""

                # Clamp at zero so a negative budget can never turn into a
                # negative slice index (which would keep most of the text).
                remaining_chars = max(max_chars - total_chars, 0)
                if len(text) > remaining_chars:
                    # Only keep what still fits, then stop reading pages.
                    extracted_text.append(text[:remaining_chars])
                    logger.info(
                        f"Reached {max_chars} character limit at page {page_num}"
                    )
                    break

                extracted_text.append(text)
                total_chars += len(text)
                logger.info(f"Processed page {page_num}/{num_pages}")

            final_text = '\n'.join(extracted_text)
            logger.info(
                f"Extraction complete! Total characters: {len(final_text)}"
            )
            return final_text

    except PdfReadError:
        logger.error(f"Invalid or corrupted PDF file: {file_path}")
        return None
    except Exception:
        # Best-effort API: report the failure (with traceback) and return
        # None instead of propagating.
        logger.exception(
            f"An unexpected error occurred while reading {file_path}"
        )
        return None


def get_pdf_metadata(file_path: str) -> Optional[dict]:
    """Get the metadata of a given PDF file.

    Args:
        file_path: path to a PDF file

    Returns:
        dictionary with the metadata information, or None if the path fails
        validation or the file cannot be read. Keys:
            'num_pages': number of pages in the document
            'metadata': the document information object as returned by
                pypdf's ``PdfReader.metadata`` (may be None when the PDF
                carries no information dictionary)
    """
    if not validate_pdf(file_path):
        return None

    try:
        with open(file_path, 'rb') as file:
            pdf_reader = pypdf.PdfReader(file)
            return {
                'num_pages': len(pdf_reader.pages),
                'metadata': pdf_reader.metadata,
            }
    except Exception:
        # Best-effort API: report through the module logger (with traceback)
        # and signal the failure with None, like the other helpers here.
        logger.exception(f"Error extracting metadata from {file_path}")
        return None


def get_datetime_from_pdf_metadata(metadata: dict, key: str) -> Optional[str]:
    """Extract a datetime string from the metadata of a PDF file.

    PDF dates have the form "D:YYYYMMDDHHmmSSOHH'mm'", where the "D:" prefix
    and every field after the year may be absent. Missing fields default to
    month 1, day 1 and time 00:00:00.

    The timezone offset is deliberately not applied: the result is the
    wall-clock time exactly as recorded in the PDF, e.g.
    "D:20210714143000+02'00'" -> "2021-07-14 14:30:00".

    Args:
        metadata: dictionary with the metadata information (None or empty
            is accepted and yields None)
        key: key to extract the datetime from (e.g. "/CreationDate")

    Returns:
        the datetime string formatted as "YYYY-MM-DD HH:MM:SS", or None if
        the key is absent or its value is not a valid PDF date
    """
    if not metadata or key not in metadata:
        return None

    pdf_date_string = metadata[key]
    if not isinstance(pdf_date_string, str):
        return None

    # Optional "D:" prefix, mandatory 4-digit year, then up to five optional
    # 2-digit fields (month, day, hour, minute, second). Whatever follows
    # (timezone offset, "Z", stray characters) is ignored.
    match = re.match(
        r"(?:D:)?([0-9]{4})([0-9]{2})?([0-9]{2})?([0-9]{2})?([0-9]{2})?([0-9]{2})?",
        pdf_date_string.strip(),
    )
    if match is None:
        return None

    # Defaults for fields that were omitted (the year is always present).
    defaults = (1, 1, 1, 0, 0, 0)
    fields = [
        int(group) if group is not None else default
        for group, default in zip(match.groups(), defaults)
    ]

    try:
        dt = datetime.datetime(*fields)
    except ValueError:
        # Out-of-range component (e.g. month 13): not a usable date.
        return None

    return dt.strftime("%Y-%m-%d %H:%M:%S")


def get_metadata_info(pdf_path: str) -> dict:
    """Build a dictionary with basic and additional information about a PDF file.

    Args:
        pdf_path: path to the PDF file

    Returns:
        dictionary with the metadata information. 'file_name' is always
        present; 'num_pages', 'creation_date' and 'modification_date' are
        added only when the PDF metadata could be read (the two dates are
        None when the PDF does not record them).
    """
    # basic information about the file
    metadata_info = {'file_name': os.path.basename(pdf_path)}

    # additional information about the file
    pdf_metadata = get_pdf_metadata(pdf_path)
    if pdf_metadata:
        # A PDF without an information dictionary yields None here; use an
        # empty mapping so the date lookups simply come back as None.
        document_info = pdf_metadata['metadata'] or {}

        metadata_info['num_pages'] = pdf_metadata['num_pages']
        metadata_info['creation_date'] = get_datetime_from_pdf_metadata(
            document_info, '/CreationDate'
        )
        metadata_info['modification_date'] = get_datetime_from_pdf_metadata(
            document_info, '/ModDate'
        )

    return metadata_info