from langchain_community.document_loaders import PyMuPDFLoader import fitz import os import requests from utilities.debugger import dprint import uuid def download_document(app_state, url, file_name, download_folder): file_path = os.path.join(download_folder, file_name) if not os.path.exists(download_folder): os.makedirs(download_folder) if not os.path.exists(file_path): print(f"Downloading {file_name} from {url}...") response = requests.get(url) if response.status_code == 200: with open(file_path, 'wb') as f: f.write(response.content) else: dprint(app_state, f"Failed to download document from {url}. Status code: {response.status_code}") else: dprint(app_state, f"{file_name} already exists locally.") return file_path def get_documents(app_state): for url in app_state.document_urls: dprint(app_state, f"Downloading and loading document from {url}...") file_name = url.split("/")[-1] file_path = download_document(app_state, url, file_name, app_state.download_folder) loader = PyMuPDFLoader(file_path) loaded_document = loader.load() single_text_document = "\n".join([doc.page_content for doc in loaded_document]) dprint(app_state, f"Number of pages: {len(loaded_document)}") # lets get titles and metadata pdf = fitz.open(file_path) metadata = pdf.metadata title = metadata.get('title', 'Document 1') document = { "url": url, "title": title, "metadata": metadata, "loaded_document": loaded_document, "single_text_document": single_text_document, "document_id": str(uuid.uuid4()) } app_state.add_document(document) dprint(app_state, f"Title of Document: {title}") dprint(app_state, f"Full metadata for Document 1: {metadata}") pdf.close() print(f"Total documents: {len(app_state.documents)}")