Spaces:
Sleeping
Sleeping
File size: 1,286 Bytes
4834106 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
from langchain_community.document_loaders import PyPDFLoader
import os
from typing import List
class PDFProcessor:
    """
    Helper that pulls plain text out of PDF documents.
    """

    def extract_text_from_pdfs(self, file_paths: List[str]) -> List[str]:
        """
        Gather the text of every document produced from the given PDFs.

        Each path is opened with ``PyPDFLoader`` and every document returned
        by its ``load_and_split()`` call contributes one entry to the result,
        in the order the paths and documents are encountered.

        Args:
            file_paths (List[str]): Paths of the PDF files to read.

        Returns:
            List[str]: The collected text entries. A file that raises while
            being processed is reported on stdout and skipped; entries
            gathered before the failure are kept.
        """
        collected: List[str] = []

        for pdf_path in file_paths:
            try:
                for document in PyPDFLoader(pdf_path).load_and_split():
                    content = document.page_content

                    if isinstance(content, str):
                        collected.append(content)
                    elif isinstance(content, bytes):
                        # Raw bytes are decoded leniently: undecodable
                        # sequences are dropped rather than raising.
                        collected.append(content.decode('utf-8', errors='ignore'))
                    else:
                        # Neither text nor bytes: report it and move on.
                        print(f"Unexpected type: {type(content)}")
            except Exception as exc:
                # Best-effort: one unreadable file must not abort the batch.
                print(f"Failed to process {pdf_path}: {exc}")

        return collected