File size: 5,483 Bytes
1c18375
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
""" pdf_utils.py

Utilities for working with PDFs

:author: Didier Guillevic
:email: [email protected]
:creation: 2024-12-21
"""

import pypdf
import os
import datetime
import pytz

import logging
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Configure the root logger at INFO level when this module is imported
# (logging.basicConfig does nothing if the root logger already has handlers).
logging.basicConfig(level=logging.INFO)


def validate_pdf(file_path: str) -> bool:
    """Check that a path points to an existing file with a '.pdf' extension.

    Only the path is inspected; the file content is not opened or parsed.

    Args:
        file_path: path to the candidate PDF file

    Returns:
        True if the path is an existing regular file whose name ends with
        '.pdf' (case-insensitive), False otherwise. The reason for a
        rejection is logged as an error.
    """
    # isfile (rather than exists) so that a directory named "*.pdf" is rejected
    if not os.path.isfile(file_path):
        logger.error(f"File not found at path: {file_path}")
        return False
    if not file_path.lower().endswith('.pdf'):
        logger.error("File is not a PDF")
        return False
    return True


def get_text_from_pdf(
        file_path: str,
        max_chars: int = 100_000_000
    ) -> str:
    """Extract the text from a given PDF file.

    Pages are read in order and their text is joined with newlines. Reading
    stops as soon as the accumulated page text reaches ``max_chars``.

    Args:
        file_path: path to the PDF file
        max_chars: max length (in chars) of page text to be read from the
            file; the newline separators inserted between pages are not
            counted towards this limit

    Returns:
        the extracted text, or None if the file fails validation or cannot
        be read.
    """
    # PdfReadError is defined in pypdf.errors; it is not exported at the
    # top level of the pypdf package, so it is imported explicitly here.
    from pypdf.errors import PdfReadError

    if not validate_pdf(file_path):
        return None

    try:
        with open(file_path, 'rb') as file:
            pdf_reader = pypdf.PdfReader(file)
            num_pages = len(pdf_reader.pages)
            logger.info(f"Processing PDF with {num_pages} pages...")

            extracted_text = []
            total_chars = 0

            for page_num, page in enumerate(pdf_reader.pages, start=1):
                # Defensive: treat a page without extractable text as empty
                text = page.extract_text() or ""

                # Check if adding this page's text would exceed the limit
                if total_chars + len(text) > max_chars:
                    # Only add text up to the limit
                    remaining_chars = max_chars - total_chars
                    extracted_text.append(text[:remaining_chars])
                    logger.info(
                        f"Reached {max_chars} character limit at page {page_num}"
                    )
                    break

                extracted_text.append(text)
                total_chars += len(text)
                logger.info(f"Processed page {page_num}/{num_pages}")

            final_text = '\n'.join(extracted_text)
            logger.info(
                f"Extraction complete! Total characters: {len(final_text)}"
            )
            return final_text

    except PdfReadError:
        logger.error("Invalid or corrupted PDF file")
        return None
    except Exception as e:
        # Best-effort API: report the failure (with traceback) and return None
        logger.exception(f"An unexpected error occurred: {e}")
        return None


def get_pdf_metadata(file_path: str) -> dict:
    """Get the metadata of a given PDF file.

    Args:
        file_path: path to a PDF file

    Returns:
        dictionary with the metadata information, or None if the file fails
        validation or cannot be read. The dictionary has two entries:
            'num_pages': number of pages in the document
            'metadata': the value of the reader's ``metadata`` attribute
                (NOTE: pypdf documents this as optional, so it may be None
                when the PDF carries no document information)
    """
    if not validate_pdf(file_path):
        return None

    try:
        with open(file_path, 'rb') as file:
            pdf_reader = pypdf.PdfReader(file)
            return {
                'num_pages': len(pdf_reader.pages),
                'metadata': pdf_reader.metadata,
            }
    except Exception as e:
        # Report through the module logger, like validate_pdf does
        logger.error(f"Error extracting metadata: {e}")
        return None


def get_datetime_from_pdf_metadata(metadata: dict, key: str) -> str:
    """Extract a datetime string from the metadata of a PDF file.

    PDF date strings have the form "D:YYYYMMDDHHmmSSOHH'mm'". Everything
    after the year may be omitted (month and day then default to 01, the
    time fields to 00), and the offset part may be absent or "Z". The
    "D:" prefix is also treated as optional.

    The offset is not applied: the result is the wall-clock time exactly as
    recorded in the file,
    e.g. "D:20210714143000+02'00'" -> "2021-07-14 14:30:00".

    Args:
        metadata: dictionary with the metadata information (may be None)
        key: key to extract the datetime from

    Returns:
        the datetime string formatted as "YYYY-MM-DD HH:MM:SS", or None if
        the key is not found or its value is not a parseable PDF date
    """
    # A PDF without document information yields no metadata at all
    if not metadata or key not in metadata:
        return None

    pdf_date_string = metadata[key]
    if not isinstance(pdf_date_string, str):
        return None

    date_string = pdf_date_string.strip()
    if date_string.startswith("D:"):
        date_string = date_string[2:]

    # The leading ASCII digits hold the date/time fields
    # (at most 14 of them: YYYYMMDDHHmmSS)
    num_digits = 0
    for char in date_string[:14]:
        if char not in "0123456789":
            break
        num_digits += 1

    # Need at least the 4-digit year, followed by complete 2-digit fields
    if num_digits < 4 or num_digits % 2:
        return None

    # Fill the omitted trailing fields with their defaults
    # (month=01, day=01, hour=00, minute=00, second=00)
    defaults = "00000101000000"
    fields = date_string[:num_digits] + defaults[num_digits:]

    try:
        dt = datetime.datetime.strptime(fields, "%Y%m%d%H%M%S")
    except ValueError:
        # Digits that do not form a valid calendar date/time
        return None

    return dt.strftime("%Y-%m-%d %H:%M:%S")


def get_metadata_info(pdf_path: str) -> dict:
    """Build a dictionary with basic and additional information about a PDF file.

    Args:
        pdf_path: path to the PDF file

    Returns:
        dictionary with the metadata information. 'file_name' is always
        present. 'num_pages', 'creation_date' and 'modification_date' are
        added only when the PDF metadata could be read; a date that is
        missing or cannot be parsed is stored as None.
    """
    # basic information about the file
    metadata_info = {'file_name': os.path.basename(pdf_path)}

    # additional information about the file
    pdf_metadata = get_pdf_metadata(pdf_path)
    if pdf_metadata:
        metadata_info['num_pages'] = pdf_metadata['num_pages']

        # The reader may report no document information at all (None);
        # normalise to an empty dict so the key lookups below are safe.
        document_info = pdf_metadata['metadata'] or {}

        date_fields = (
            ('creation_date', '/CreationDate'),
            ('modification_date', '/ModDate'),
        )
        for info_key, pdf_key in date_fields:
            try:
                metadata_info[info_key] = get_datetime_from_pdf_metadata(
                    document_info, pdf_key
                )
            except ValueError as e:
                # A malformed date must not discard the rest of the info
                logger.warning(f"Could not parse {pdf_key}: {e}")
                metadata_info[info_key] = None

    return metadata_info