Spaces:
Sleeping
Sleeping
File size: 2,091 Bytes
c6907ac |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
from langchain_community.document_loaders import PyMuPDFLoader
import fitz
import os
import requests
from utilities.debugger import dprint
import uuid
def download_document(app_state, url, file_name, download_folder, timeout=30):
    """Download ``url`` into ``download_folder`` unless the file is already there.

    Args:
        app_state: Application state object; only passed through to ``dprint``.
        url: Address the document is fetched from.
        file_name: Name of the target file inside ``download_folder``.
        download_folder: Directory that holds downloaded files; created when
            it does not exist yet.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a stalled server would block the caller forever.

    Returns:
        The path ``download_folder/file_name``. The path is returned even
        when the server answered with a non-200 status, so the file is not
        guaranteed to exist; the failure is only reported through ``dprint``.

    Raises:
        Whatever ``requests.get`` raises for transport-level failures
        (connection errors, timeouts) is propagated unchanged.
    """
    file_path = os.path.join(download_folder, file_name)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(download_folder, exist_ok=True)

    if os.path.exists(file_path):
        dprint(app_state, f"{file_name} already exists locally.")
        return file_path

    print(f"Downloading {file_name} from {url}...")
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        dprint(app_state, f"Failed to download document from {url}. Status code: {response.status_code}")
        return file_path

    # Write to a sibling temp file and rename it into place: an interrupted
    # write must not leave a truncated file that a later run would mistake
    # for a finished download ("already exists locally").
    partial_path = file_path + ".part"
    with open(partial_path, 'wb') as f:
        f.write(response.content)
    os.replace(partial_path, file_path)
    return file_path
def get_documents(app_state):
    """Download, load and register every document listed on ``app_state``.

    For each URL in ``app_state.document_urls`` the file is fetched with
    ``download_document`` into ``app_state.download_folder``, loaded with
    ``PyMuPDFLoader``, and its metadata is read with ``fitz``. A dict with
    the keys ``url``, ``title``, ``metadata``, ``loaded_document``,
    ``single_text_document`` and ``document_id`` is then handed to
    ``app_state.add_document``.

    Args:
        app_state: Application state object. This function reads
            ``document_urls``, ``download_folder`` and ``documents`` from it
            and calls its ``add_document`` method.

    Returns:
        None. Results are stored through ``app_state.add_document``.
    """
    for url in app_state.document_urls:
        dprint(app_state, f"Downloading and loading document from {url}...")
        file_name = url.split("/")[-1]
        file_path = download_document(app_state, url, file_name, app_state.download_folder)

        # download_document returns the path even when the download failed;
        # skip such URLs instead of letting the loader fail on a missing file.
        if not os.path.isfile(file_path):
            dprint(app_state, f"Skipping {url}: no local file at {file_path}")
            continue

        loaded_document = PyMuPDFLoader(file_path).load()
        single_text_document = "\n".join(doc.page_content for doc in loaded_document)
        dprint(app_state, f"Number of pages: {len(loaded_document)}")

        # Read title and metadata; the context manager closes the file even
        # if reading the metadata raises.
        with fitz.open(file_path) as pdf:
            metadata = pdf.metadata

        # The 'title' key can be present but empty or None, in which case
        # dict.get's default would never apply; `or` covers those cases too.
        title = metadata.get('title') or 'Document 1'

        document = {
            "url": url,
            "title": title,
            "metadata": metadata,
            "loaded_document": loaded_document,
            "single_text_document": single_text_document,
            "document_id": str(uuid.uuid4()),
        }
        app_state.add_document(document)
        dprint(app_state, f"Title of Document: {title}")
        dprint(app_state, f"Full metadata for Document 1: {metadata}")

    print(f"Total documents: {len(app_state.documents)}")
|